Skip to content

Instantly share code, notes, and snippets.

@joegr
Last active March 30, 2024 20:37
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save joegr/45a47c10ea23cb5a393cbd5d29e5161a to your computer and use it in GitHub Desktop.
Save joegr/45a47c10ea23cb5a393cbd5d29e5161a to your computer and use it in GitHub Desktop.
Big Little Language Model Code
class Entity:
    """A named entity with an auto-generated unique id.

    Attributes:
        id:   random UUID identifying this instance.
        name: entity name.
        desc: free-text description.
    """

    def __init__(self, name, desc):
        self.id = uuid4()
        self.name = name
        self.desc = desc

    def __str__(self):
        # Bug fix: uuid4() returns a uuid.UUID object; __str__ must return
        # a str or str(entity) raises TypeError.
        return str(self.id)

    def __repr__(self):
        # Bug fix: same as __str__ — __repr__ must return a str.
        return str(self.id)

    def _get(self):
        """Return (id, name, desc) as a tuple."""
        return self.id, self.name, self.desc
from collections import deque
def list_base_tokens(path=None):
    """Return all whitespace-separated tokens from a word-list file.

    Args:
        path: file to read. Defaults to the module-level ``words``.
              NOTE(review): ``words`` is not defined anywhere in this
              chunk — presumably assigned elsewhere in the file; confirm.

    Returns:
        list[str]: every whitespace-delimited token in the file, in order.
    """
    if path is None:
        path = words  # original behavior: read the global word-list path
    # str.split() with no argument already splits on any whitespace,
    # so the manual append loop is unnecessary.
    with open(path) as f:
        return f.read().split()
class KB:
    """In-memory knowledge base mapping entities to alias sets.

    Attributes:
        pool_lim: maximum number of aliases stored per entity.
        vx:       CountVectorizer for 6-gram name matrices (sklearn,
                  imported elsewhere in the file).
        I:        entity id -> entity name.
        nm:       entity name -> CV matrix (currently unused).
        vocab:    token -> count of mentions across training descriptions.
        pmem:     entity id -> set of first-level aliases.
        smem:     entity id -> set of second-level aliases.
    """

    def __init__(self, vocab=None):
        # Bug fix: the original default `vocab=list_base_tokens()` was a
        # call-expression default — it read the word file once at import
        # time, and the resulting argument was then ignored in favor of a
        # second list_base_tokens() call. Accept an optional token list and
        # only read the word file when none is supplied.
        self.pool_lim = 100
        self.vx = CountVectorizer(analyzer="word", ngram_range=(6, 6))
        self.I = {}      # entid : Entname
        self.nm = {}     # ENTNAME : CVMatrix
        self.vocab = {}  # token: count all tokens uniquely mentioned in training descriptions
        self.pmem = {}   # set of 1st level aliases, ie EntID:set()
        self.smem = {}   # set of 2nd level aliases, ie EntID:set()
        if vocab is None:
            vocab = list_base_tokens()
        for word in vocab:
            self.vocab[word] = 0

    def add(self, ent):
        """Index *ent* by id and fold its name/desc tokens into the vocab."""
        eid, name, desc = ent._get()
        print(eid, name, desc)
        if name == "" or desc == "":
            return
        # Bug fix: join with a space — "".join([n, d]) fused the last name
        # token with the first description token into one bogus token.
        new_toks = word_tokenize(" ".join([name, desc]))
        print(new_toks)
        # (The original `new_toks != ""` check compared a list to a string
        # and was always true; dropped.)
        for token in new_toks:
            if token in stops:  # skip stopwords
                continue
            # Bug fix: a freshly seen token has been seen once, so it
            # counts as 1 — the original initialized new tokens to 0.
            self.vocab[token] = self.vocab.get(token, 0) + 1
        # index resolution
        if eid not in self.I:
            self.I[eid] = name
        else:
            print("entid exists")

    def addL1_alias(self, entid, a):
        """Attach first-level alias *a* to *entid* unless already in use."""
        # confirm entity exists
        if not self.idcheck(entid):
            print("entid does not exist, please add ent before alias")
            return
        # confirm memory is not full
        self.memcheck()
        # Bug fix: np.array(self.pmem.values()).flatten() wrapped the
        # dict_values view in a 0-d object array, so the membership test
        # never matched; and pmem[entid] = set() discarded any aliases
        # previously stored for this entity.
        if any(a in aliases for aliases in self.pmem.values()):
            owner = self.find(a)
            return "existing alias in memory tied to different entid ", owner
        self.pmem.setdefault(entid, set()).add(a)
        return ("alias does not exist adding", a, " to ent ", entid)

    def addL2_alias(self, entid, a):
        """Attach second-level alias *a* to *entid*; False if already in use."""
        if not self.idcheck(entid):
            return None  # original fell through with no return value
        self.memcheck(l2=True)
        # Bug fix: np.array(self.smem.values) passed the *unbound method*
        # (missing parentheses), so the check was meaningless; and
        # smem[entid] = set() clobbered existing aliases.
        if any(a in aliases for aliases in self.smem.values()):
            return False
        self.smem.setdefault(entid, set()).add(a)
        return str(self.smem)

    def learn_ents_from_csv(self, path):
        """Load entities from a CSV whose columns 1 and 2 are name, desc."""
        df = pd.read_csv(path).fillna("")
        for tup in df.itertuples():
            # tup[0] is the frame index; tup[1]/tup[2] are name/description.
            self.add(Entity(tup[1], tup[2]))
        return "learned ", df.head()

    def ents(self):
        """String form of the id -> name index."""
        return str(self.I)

    def aliases(self):
        """String form of all first- and second-level alias sets."""
        return str(list(self.pmem.values()) + list(self.smem.values()))

    def learn_aliases_from_csv(self, path):
        """Load (entid, alias) rows from a CSV, trying L1 then L2 memory."""
        df = pd.read_csv(path)
        # Bug fix: fillna returns a new frame; the original discarded it.
        df = df.fillna("")
        for tup in df.itertuples():
            entid = tup[0]
            alias = tup[1]
            try:
                self.addL1_alias(entid, alias)
            except Exception as e:
                try:
                    self.addL2_alias(entid, alias)
                except Exception as ex:
                    return e, ex
        # Bug fix: DataFrame has no .rows() method — report the row count.
        return "learned, ", len(df)

    def summary(self):
        return "In Memory Knowledge Base initialized on Google's first 10,000 words and MDD linkages"

    def __repr__(self):
        # NOTE(review): plotting inside __repr__ is a surprising side
        # effect; kept for compatibility with the original behavior.
        plt.plot(np.array(list(self.vocab.values())))
        # Report only tokens actually observed during training.
        return str({k: v for k, v in self.vocab.items() if v > 0})

    def memcheck(self, l2=False):
        """Return an error tuple if any alias pool hit pool_lim, else None."""
        mem = self.smem if l2 else self.pmem
        label = "secondary" if l2 else "primary"
        for eid, aliases in mem.items():
            if len(aliases) >= self.pool_lim:
                return False, "ERR: alias %s memory full at" % label, eid
        return None

    def find(self, string):
        """Return the entity id that owns alias *string*, or a miss message.

        Bug fix: the original called pmem.get(alias) — but pmem is keyed
        by entity id, so alias lookups always returned None; dict.get never
        raises, so the except branch and the entire secondary-memory block
        after the unconditional return were unreachable.
        """
        # try immediate primary memory "edge" relationships
        for eid, aliases in self.pmem.items():
            if string in aliases:
                return eid
        # try secondary memory "edge" relationships
        for eid, aliases in self.smem.items():
            if string in aliases:
                return eid
        return "Token not found in primary alias memory"

    def idcheck(self, entid):
        """True if *entid* is a known entity id."""
        return entid in self.I
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment