Skip to content

Instantly share code, notes, and snippets.

@darkacorn
Created February 13, 2024 22:26
Show Gist options
  • Save darkacorn/f786564868357cde5894ef6e2c6f64cf to your computer and use it in GitHub Desktop.
Save darkacorn/f786564868357cde5894ef6e2c6f64cf to your computer and use it in GitHub Desktop.
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
# Sentence-embedding model, loaded once at import time. It is used through
# `model.encode(...)` by front_trunc_emb and end_trunc_emb.
model = SentenceTransformer('TaylorAI/bge-micro')
import os
import re
# Directory that is scanned for the '.txt' story files to clean.
# The trailing slash is kept so the value can be used as a file-name prefix.
input_folder = "/home/o_0/Downloads/www.asstr.org/files/Collections/Kristen's_Collection/Book_Shelf_A/"
# Directory prefix used to build the path of each cleaned output file
# (trailing slash kept for the same reason).
output_folder = "/aimodels/scrapehappy/cleaned_storys/"
import os
# List the input directory once. Despite its name, `csv_files` holds the
# directory entries whose names end in '.txt'.
path = input_folder
all_files = os.listdir(path)
csv_files = [name for name in all_files if name.endswith('.txt')]
def remove_strings(lst):
    """Return the elements of *lst* that are at least half alphabetic.

    An element is kept when its count of non-alphabetic characters (digits,
    punctuation and whitespace all count) does not exceed its count of
    alphabetic characters. Ties are kept, so the empty string survives.

    Args:
        lst: Iterable of strings.

    Returns:
        A new list with the surviving elements in their original order.
    """
    kept = []
    for element in lst:
        letters = sum(c.isalpha() for c in element)
        # Every character is either a letter or not, so the non-letter
        # count is the remainder; no second pass over the string is needed.
        if len(element) - letters <= letters:
            kept.append(element)
    return kept
def end_trunc(lst):
    """Cut everything up to and including the first "Kristen's collection" marker.

    The caller in this file passes the paragraphs in reverse order, so the
    elements "after" the marker here are the ones that precede it in reading
    order.

    Args:
        lst: List of paragraph strings.

    Returns:
        A new list with the elements that follow the first element
        containing the (case-sensitive) marker. When no element contains the
        marker, the result of ``remove_strings(lst)`` is returned instead.
    """
    # The marker is searched in the function's own argument rather than in a
    # module-level list, so the function works for any list it is given.
    for i, paragraph in enumerate(lst):
        if "Kristen's collection" in paragraph:
            return lst[i + 1:]
    # NOTE(review): the original indentation was lost; the fallback is
    # assumed to apply only when no marker was found (for/else) - confirm.
    return remove_strings(lst)
def front_trunc(lst):
    """Drop the leading boilerplate paragraphs of a story.

    Scans *lst* for the first paragraph that contains one of the known
    header phrases (after collapsing runs of whitespace to single spaces)
    and returns everything after it. If that yields nothing - no phrase was
    found, or the phrase sits in the last paragraph - the embedding-based
    ``front_trunc_emb`` is used as a fallback.

    Args:
        lst: List of paragraph strings.

    Returns:
        A list of the paragraphs kept. An empty *lst* gives an empty list
        without invoking the fallback.
    """
    # Phrases are matched case-sensitively and verbatim, including the
    # spelling "CENSER".
    markers = (
        "BECAUSE I DON'T CENSER MY COLLECTING",
        'FROM THE STORIES YOU READ',
        'This work is copyrighted to the author',
        'This story is copyrighted',
    )
    result = []
    # Each paragraph is read from the argument itself rather than from a
    # module-level list, so the function works for any list it is given.
    for i, paragraph in enumerate(lst):
        text = " ".join(paragraph.split())
        if any(marker in text for marker in markers):
            result = lst[i + 1:]
            break
    # With no paragraphs at all there is nothing for the fallback to embed.
    if not result and lst:
        result = front_trunc_emb(lst)
    return result
def front_trunc_emb(result):
    """Drop leading paragraphs up to the one most similar to a copyright notice.

    Encodes the first ten paragraphs and the reference sentence with the
    module-level ``model``, picks the paragraph with the highest cosine
    similarity to the reference, and returns everything after it. For a
    non-empty input at least the first paragraph is therefore always dropped.

    Args:
        result: List of paragraph strings.

    Returns:
        A new list holding the paragraphs after the best match; an empty
        list when *result* is empty.
    """
    # Nothing to embed or compare for an empty input.
    if not result:
        return []
    candidates = result[:10]
    embeddings = model.encode(candidates)
    reference = model.encode(["This work is copyrighted to the author"])
    # One row of similarities: the reference against each candidate.
    similarities = cosine_similarity(reference, embeddings)[0]
    closest = int(np.argmax(similarities))
    return result[closest + 1:]
def end_trunc_emb(result):
    """Keep the paragraphs up to and including the best "keep away from children" match.

    Encodes the first ten paragraphs and the reference sentence with the
    module-level ``model``, picks the paragraph with the highest cosine
    similarity to the reference, and returns the list up to and including
    that paragraph.

    NOTE(review): nothing in the visible code calls this function, and
    keeping a prefix chosen among the *first* ten paragraphs is surprising
    for an end-truncation helper - confirm the intended direction before use.

    Args:
        result: List of paragraph strings.

    Returns:
        A new list ending with the best-matching paragraph; an empty list
        when *result* is empty.
    """
    # Nothing to embed or compare for an empty input.
    if not result:
        return []
    candidates = result[:10]
    embeddings = model.encode(candidates)
    reference = model.encode(
        ["Please keep this story,and all erotic stories out of the hand of children"]
    )
    # One row of similarities: the reference against each candidate.
    similarities = cosine_similarity(reference, embeddings)[0]
    closest = int(np.argmax(similarities))
    return result[:closest + 1]
# Main driver: for every input file, split it into paragraphs, strip the
# leading and trailing boilerplate, and print the result. Writing the cleaned
# text to disk is currently disabled (see the commented-out lines below).
# NOTE(review): the original indentation was lost; the nesting below is the
# reading under which every name is defined before use - confirm.
x = 0  # number of files opened so far
for idx, i in enumerate(csv_files):
    # Context manager so the handle is closed as soon as the lines are read.
    with open(os.path.join(path, i), "r", errors='replace') as text:
        raw = text.readlines()
    x += 1
    print(i)
    if not raw:
        continue

    # Group consecutive non-blank lines into paragraphs. A line counts as
    # blank only when it is a newline optionally preceded by spaces.
    paragraphs = []
    current = []
    for line in raw:
        if line.replace(" ", "") == '\n':
            if current:
                paragraphs.append(''.join(current))
                current = []
        else:
            current.append(line)
    if current:
        paragraphs.append(''.join(current))

    # `sd` stays a module-level name that is rebound (or reversed in place)
    # right before each helper call, so a helper that looks at the global
    # sees exactly the list it is being passed.
    sd = [p.strip().replace('\n', ' ') for p in paragraphs]
    sd = front_trunc(sd)
    if not sd:
        # NOTE(review): this stops the whole run at the first file whose
        # front truncation leaves nothing; it looks like a debugging stop -
        # confirm whether `continue` is wanted instead.
        print(i)
        break

    # end_trunc works from the end of the story, so feed it the paragraphs
    # in reverse and restore the reading order afterwards.
    sd.reverse()
    sd = end_trunc(sd)
    sd.reverse()

    new_file_path = os.path.join(output_folder, 'processed' + i)
    #with open(new_file_path, "w") as f:
    #    for line in sd:
    #        f.write("%s\n" % line)
    print(sd)
    print(new_file_path)
    print("------------")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment