I checked it manually and it is not working properly: the date and title do not match the para. Can you please illustrate it with code?
Try this: change the output of the query function by adding the indexes of the neighbors:
def query(self, query_sent, query_word, k=10, filter_same_word=False):
    toks, embs = self.model([query_sent])[0]
    found = False
    for tok, emb in zip(toks, embs):
        if tok == query_word:
            found = True
            break
    if not found:
        raise ValueError('The query word {} is not a single token in sentence {}'.format(query_word, toks))
    emb = emb / sum(emb**2)**0.5
    if filter_same_word:
        initial_k = max(k, 100)
    else:
        initial_k = k
    di, idx = self.indexer.query(emb.reshape(1, -1), k=initial_k)
    distances = []
    neighbors = []
    contexts = []
    for i, index in enumerate(idx.ravel()):
        token = self.all_tokens[index]
        if filter_same_word and (query_word in token or token in query_word):
            continue
        distances.append(di.ravel()[i])
        neighbors.append(token)
        contexts.append(self.sentences[self.sentence_ids[index]])
        if len(distances) == k:
            break
    indexes = idx.ravel()
    return distances, neighbors, contexts, indexes
Then in your code: remove date from zip() and replace it with the indexes, then iterate over the date list using those indexes, like this:
distances, neighbors, contexts, indexes = storage.query(query_sent='It is an investment bank.', query_word='bank', k=50)
dd = []
date = df["date"].tolist()
for d, w, c, idx in zip(distances, neighbors, contexts, indexes):
    dd.append(
        {'date': date[idx],
         'neigh': w,
         'score': d,
         'para': c.strip()
         })
ad = pd.DataFrame(dd)
Also, I tried the following, but the error is the same:
distances, neighbors, contexts, indexes = storage.query(query_sent='It is an investment bank.', query_word='bank', k=50)
dd = []
# date = df["date"].tolist()
for d, w, c, idx in zip(distances, neighbors, contexts, indexes):
    row_dict = df2.loc[df2.index == lines[idx]].to_dict()
    dd.append(
        {'date': row_dict["date"][lines[idx]],
         'neigh': w,
         'score': d,
         'para': c.strip()
         })
ad = pd.DataFrame(dd)
Hello AsmaZbt, I would really appreciate your input. The previous error is fixed, but I'm getting a new error that I could not figure out despite spending many hours. It appears after running this code:
distances, neighbors, contexts, indexes = storage.query(query_sent='It is an investment bank.', query_word='bank', k=5)
dd = []
# date = df["date"].astype(int)
date = df["date"].tolist()
for d, w, c, idx in zip(distances, neighbors, contexts, indexes):
    dd.append(
        {'date': date[idx],
         'neigh': w,
         'score': d,
         'para': c.strip()
         })
ad = pd.DataFrame(dd)
Here are the complete data processing steps:
df = pd.read_csv("/content/df3.csv")
df = df.set_index("content")
df.head(1)
text_dict = df.to_dict()
len_text = len(text_dict["date"])
df = df["date"].to_dict()
df_sentences_list = list(df.keys())
len(df_sentences_list)
df_sentences_list = [str(d) for d in tqdm(df_sentences_list)]
file_content = "\n".join(df_sentences_list)
with open("input_text.txt", "w") as f:
    f.write(file_content)
with open("/content/input_text.txt", "r") as f:
    lines1 = f.readlines()
Then I run all the required parts of the code and read the data again before running the final query code:
distances, neighbors, contexts, indexes = storage.query(query_sent='It is an investment bank.', query_word='bank', k=5)
dd = []
# date = df["date"].astype(int)
date = df["date"].tolist()
for d, w, c, idx in zip(distances, neighbors, contexts, indexes):
    dd.append(
        {'date': date[idx],
         'neigh': w,
         'score': d,
         'para': c.strip()
         })
ad = pd.DataFrame(dd)
But I get the error as given above.
Hello @Shafi2016,
Could you please print idx, so we can see the type of this variable?
Very strangely, when I use print(idx) inside the loop I get nothing out of it. However, when I use it outside the loop I get 10122.
Could you please share your query function?
def query(self, query_sent, query_word, k=10, filter_same_word=False):
    toks, embs = self.model([query_sent])[0]
    found = False
    for tok, emb in zip(toks, embs):
        if tok == query_word:
            found = True
            break
    if not found:
        raise ValueError('The query word {} is not a single token in sentence {}'.format(query_word, toks))
    emb = emb / sum(emb**2)**0.5
    if filter_same_word:
        initial_k = max(k, 100)
    else:
        initial_k = k
    di, idx = self.indexer.query(emb.reshape(1, -1), k=initial_k)
    distances = []
    neighbors = []
    contexts = []
    for i, index in enumerate(idx.ravel()):
        token = self.all_tokens[index]
        if filter_same_word and (query_word in token or token in query_word):
            continue
        distances.append(di.ravel()[i])
        neighbors.append(token)
        contexts.append(self.sentences[self.sentence_ids[index]])
        #indexes = idx.ravel()
        if len(distances) == k:
            break
        indexes = idx.ravel()
    return distances, neighbors, contexts, indexes
print(idx)
indexes must be outside the for loop, like this:
for i, index in enumerate(idx.ravel()):
    token = self.all_tokens[index]
    if filter_same_word and (query_word in token or token in query_word):
        continue
    distances.append(di.ravel()[i])
    neighbors.append(token)
    contexts.append(self.sentences[self.sentence_ids[index]])
    #indexes = idx.ravel()
    if len(distances) == k:
        break
indexes = idx.ravel()  ########## indexes must be outside the for loop
return distances, neighbors, contexts, indexes
print(idx)
for i, index in enumerate(idx.ravel()):
    token = self.all_tokens[index]
    if filter_same_word and (query_word in token or token in query_word):
        continue
    distances.append(di.ravel()[i])
    neighbors.append(token)
    contexts.append(self.sentences[self.sentence_ids[index]])
    #indexes = idx.ravel()
    if len(distances) == k:
        break
    indexes = idx.ravel()
return distances, neighbors, contexts, indexes
print(idx)
You need to write it at the same indentation as the return. You use a break, which means stop, so nothing after it inside that condition can run; you just return. You only need to collect the list of indexes; there is no change and no condition involved, so you can add it at the end, or, if that is not clear, just after this instruction:
di, idx = self.indexer.query(emb.reshape(1, -1), k=initial_k)
indexes = idx.ravel()
distances = []
Then you delete it from the end, or you do it at the end as I mentioned.
Not at all, you're welcome.
Try to print len(date), and print(idx) inside the for loop.
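For example, a minimal check (just a sketch, reusing the same variable names as in your snippet above):

date = df["date"].tolist()
print(len(date))  # how many dates are available

distances, neighbors, contexts, indexes = storage.query(
    query_sent='It is an investment bank.', query_word='bank', k=5)
for d, w, c, idx in zip(distances, neighbors, contexts, indexes):
    print(idx)  # each neighbor's index; compare it against len(date)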
The length of the date list is 8804 and you have an index of 10759; that's why you get an out-of-range error.
I need to think.
Do you have two files, a .txt and a .csv? Then maybe the length of the list of sentences is not the same as in your CSV file.
I thought that your sentences came from the same CSV file, so that the lengths of date and para were the same, but now I'm sorry, I can't help you.
You need to work out how to link the sentences and the dates (for each sentence, the appropriate date); then you can solve the problem.
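One possible way to link them (only a sketch, assuming the CSV has a "content" column holding the sentences and a "date" column, and that the contexts returned by the query contain those same sentence strings) is to look the date up by sentence instead of by raw index:

import pandas as pd

df = pd.read_csv("/content/df3.csv")
# map each sentence (the 'content' column) to its date
date_by_sentence = dict(zip(df["content"].astype(str), df["date"]))

dd = []
for d, w, c in zip(distances, neighbors, contexts):
    dd.append({'date': date_by_sentence.get(c.strip()),  # None if the sentence is not found
               'neigh': w,
               'score': d,
               'para': c.strip()})
ad = pd.DataFrame(dd)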
It is the same CSV file. I first convert the CSV to a list and to text, as in the original example:
df = pd.read_csv("/content/df3.csv", parse_dates=True)
df = df.set_index("content")
df.head(1)
text_dict = df.to_dict()
len_text = len(text_dict["date"])
df = df["date"].to_dict()
df_sentences_list = list(df.keys())
len(df_sentences_list)
df_sentences_list = [str(d) for d in tqdm(df_sentences_list)]
file_content = "\n".join(df_sentences_list)
with open("input_text.txt", "w") as f:
    f.write(file_content)
with open("/content/input_text.txt", "r") as f:
    lines1 = f.readlines()
lines1[0]
all_sentences = [l.split('\t')[0] for l in lines1]
Again, for the date we use the same CSV file; I only disable this part: df = df.set_index("content").
If I can have your email ID, I will send refined code with a small sample of the data.
Thanks a lot!!!
@avidale Hi, thanks for the amazing work. I want to implement this, but I need a few points clarified:
- I want it to work for a large dataset. Can it handle that, or does something else need to be implemented on top of it?
- After computing the embeddings, can I save them, so that next time I just load them and get results by query?
@sridhardev07 yes and yes
Can you tell me how? That would be really helpful for me!
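A minimal sketch of one way to do it (assuming the Faiss-based variant discussed further down in this thread, where the index is stored as storage.index; faiss.write_index/read_index persist the index itself, and the token/sentence metadata can be pickled separately):

import pickle
import faiss

# save: persist the search index and the metadata needed to interpret its results
faiss.write_index(storage.index, "faiss_index.bin")
with open("metadata.pkl", "wb") as f:
    pickle.dump({"sentences": storage.sentences,
                 "sentence_ids": storage.sentence_ids,
                 "all_tokens": storage.all_tokens}, f)

# load: restore everything without re-computing the embeddings
index = faiss.read_index("faiss_index.bin")
with open("metadata.pkl", "rb") as f:
    meta = pickle.load(f)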
Hi @avidale, I tried this with a bigger dataset to test the accuracy. The dataset has about 37,126 sentences, and it is showing me a memory error: numpy.core._exceptions.MemoryError: Unable to allocate 2.35 GiB for an array with shape (819827, 768) and data type float32
I have 16 GB of RAM. Can you suggest an alternative that uses less RAM or retrieves the data from disk?
Hi @sridhardev07!
The simplest trick I could suggest is to convert all vectors from float32 to float16; this will reduce memory requirements by half without significantly affecting the quality.
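For instance (just a sketch; all_embeddings stands for the stacked numpy array of token vectors built in process_sentences):

import numpy as np

embs_f16 = np.asarray(all_embeddings, dtype=np.float16)  # store at half the size of float32
embs_f32 = embs_f16.astype(np.float32)                   # convert back when an index requires float32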
If this does not suffice, you could look at https://github.com/facebookresearch/faiss - a library for fast vector similarity search that allegedly can work with very large sets. Specifically, they implement product quantization for lossy compression of the vectors. If you choose to use Faiss, you should rewrite my solution: unite process_sentences and build_search_index into a method that processes the sentences incrementally and adds their vectors to a faiss.IndexIVFPQ instead of a KDTree.
Hi @avidale! Thanks for the answer!
I tried converting the vectors to float16; it does help to reduce the size, but not by much, since I am working with a large dataset.
I tried the second approach with Faiss. It worked well when I used a Flat index, so I can add to the index incrementally, but saving it to disk takes a lot of storage, approximately 1 GB for 15K sentences. Here is what I did:
def __init__(self, sentences, model):
    self.sentences = sentences
    self.model = model
    self.index = faiss.IndexFlatL2(768)

def process_sentences(self):
    result = self.model(self.sentences)
    self.sentence_ids = []
    self.token_ids = []
    self.all_tokens = []
    for i, (toks, embs) in enumerate(tqdm(result)):
        # initialize all_embeddings for every new sentence
        all_embeddings = []
        for j, (tok, emb) in enumerate(zip(toks, embs)):
            self.sentence_ids.append(i)
            self.token_ids.append(j)
            self.all_tokens.append(tok)
            all_embeddings.append(emb)
        all_embeddings = np.stack(all_embeddings)  # add embeddings after every sentence
        self.index.add(all_embeddings)
    faiss.write_index(self.index, "faiss_Model")
Then I tried faiss.IndexIVFPQ. It works well, but it does not work as an incremental index, since it needs the training data too, so I have to compute all the embeddings first and then train and add. The resulting index is small, but building it takes too much RAM, which causes issues when working with large data. Here is what I did:
def __init__(self, sentences, model):
    self.sentences = sentences
    self.model = model
    self.quantizer = faiss.IndexFlatL2(768)
    self.index = faiss.IndexIVFPQ(self.quantizer, 768, 1000, 16, 8)

def process_sentences(self):
    result = self.model(self.sentences)
    self.sentence_ids = []
    self.token_ids = []
    self.all_tokens = []
    all_embeddings = []
    for i, (toks, embs) in enumerate(tqdm(result)):
        for j, (tok, emb) in enumerate(zip(toks, embs)):
            self.sentence_ids.append(i)
            self.token_ids.append(j)
            self.all_tokens.append(tok)
            all_embeddings.append(emb)
    all_embeddings = np.stack(all_embeddings)
    self.index.train(all_embeddings)  # train
    self.index.add(all_embeddings)    # add to index
    faiss.write_index(self.index, "faiss_Model_mini")
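A possible workaround (only a sketch, not tested on this data): an IndexIVFPQ only needs a representative sample of vectors for train(); once trained, vectors can be added in batches, so the full embedding matrix never has to be held in memory at once. Here train_vectors is a random placeholder for a real sample of embeddings, and iter_embedding_batches is a hypothetical generator yielding float32 arrays of shape (n, 768):

import numpy as np
import faiss

d = 768
quantizer = faiss.IndexFlatL2(d)
index = faiss.IndexIVFPQ(quantizer, d, 1000, 16, 8)

# 1) train on a subsample of vectors (e.g. embeddings of the first few thousand sentences)
train_vectors = np.float32(np.random.rand(50000, d))  # placeholder for a real sample of embeddings
index.train(train_vectors)

# 2) add the remaining vectors batch by batch, computing embeddings chunk by chunk
for batch in iter_embedding_batches():  # hypothetical helper, not part of the original code
    index.add(np.ascontiguousarray(batch, dtype=np.float32))

faiss.write_index(index, "faiss_Model_mini")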
Okay, thanks once again! I shall change the name of "date", and probably add the title to it as well. Then I can check a few entries manually to see whether everything is going well.