from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import os
import re

model = SentenceTransformer('TaylorAI/bge-micro')

input_folder = "/home/o_0/Downloads/www.asstr.org/files/Collections/Kristen's_Collection/Book_Shelf_A/"
output_folder = "/aimodels/scrapehappy/cleaned_storys/"

path = input_folder
all_files = os.listdir(path)
txt_files = list(filter(lambda f: f.endswith('.txt'), all_files))
def remove_strings(lst):
    # Keep only paragraphs that are mostly alphabetic characters.
    result = []
    for element in lst:
        alphabets = sum(c.isalpha() for c in element)
        non_alphabets = sum(not c.isalpha() for c in element)
        if non_alphabets <= alphabets:
            result.append(element)
    return result
def end_trunc(lst):
    # Called on a reversed story: drop everything up to and including the
    # "Kristen's collection" footer. If no footer is found, fall back to
    # filtering out mostly non-alphabetic paragraphs.
    result = []
    for i in range(len(lst)):
        if re.search("Kristen's collection", lst[i].strip()):
            result.extend(lst[i+1:])
            break
    else:
        result = remove_strings(lst)
    return result
def front_trunc(lst):
    # Drop the header block: everything up to and including the first
    # paragraph that matches one of the known disclaimer phrases.
    result = []
    for i in range(len(lst)):
        text = " ".join(lst[i].split())
        if (re.search("BECAUSE I DON'T CENSER MY COLLECTING", text)
                or re.search('FROM THE STORIES YOU READ', text)
                or re.search('This work is copyrighted to the author', text)
                or re.search('This story is copyrighted', text)):
            result.extend(lst[i+1:])
            break
    if len(result) == 0:
        result = front_trunc_emb(lst)
    return result
def front_trunc_emb(result):
    # Fallback: embed the first 10 paragraphs and cut after the one most
    # similar to a typical copyright disclaimer.
    my_list = result[0:10]
    embeddings = model.encode(my_list)
    compare_element = "This work is copyrighted to the author"
    compare_embedding = model.encode([compare_element])
    cosine_similarities = cosine_similarity(compare_embedding, embeddings)[0]
    closest_element_index = np.argmax(cosine_similarities)
    return result[closest_element_index+1:]
def end_trunc_emb(result):
    # Embedding-based fallback for the footer (currently unused): cut the
    # list just after the paragraph most similar to the "keep away from
    # children" disclaimer.
    my_list = result[0:10]
    embeddings = model.encode(my_list)
    compare_element = "Please keep this story,and all erotic stories out of the hand of children"
    compare_embedding = model.encode([compare_element])
    cosine_similarities = cosine_similarity(compare_embedding, embeddings)[0]
    closest_element_index = np.argmax(cosine_similarities)
    return result[:1+closest_element_index]
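
# Illustrative sketch (not part of the pipeline): the two *_emb helpers rank
# candidate paragraphs by cosine similarity to a reference disclaimer, e.g.
#
#   paragraphs = ["STANDARD DISCLAIMER ...", "Once upon a time ..."]
#   sims = cosine_similarity(model.encode(["This work is copyrighted to the author"]),
#                            model.encode(paragraphs))[0]
#   cut = np.argmax(sims)          # index of the paragraph most like the disclaimer
#   story = paragraphs[cut + 1:]   # keep everything after it
#
# The paragraph strings above are made-up examples, not text from the corpus.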
x = 0
for idx, i in enumerate(txt_files):
    with open(path + i, "r", errors='replace') as text:
        raw = text.readlines()
    x = x + 1
    print(i)
    if len(raw) != 0:
        sd = raw
        # Merge consecutive lines into paragraphs, using blank lines as separators.
        sublist = []
        result = []
        for element in sd:
            if element.replace(" ", "") == '\n':
                if sublist:
                    result.append(''.join(sublist))
                    sublist = []
            else:
                sublist.append(element)
        if sublist:
            result.append(''.join(sublist))
        # Collapse internal newlines/whitespace within each paragraph.
        sd = list(map(lambda s: s.strip().replace('\n', ' '), result))
        # Strip the header boilerplate, then reverse and strip the footer.
        sd = front_trunc(sd)
        if len(sd) != 0:
            sd.reverse()
            sd = end_trunc(sd)
            sd.reverse()
        else:
            # Nothing left after truncation: report the file and stop the run.
            print(i)
            break
        new_file_path = output_folder + 'processed' + i
        #with open(new_file_path, "w") as f:
        #    for line in sd:
        #        f.write("%s\n" % line)
        print(sd)
        print(new_file_path)
        print("------------")