from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import os
import re

model = SentenceTransformer('TaylorAI/bge-micro')

input_folder = "/home/o_0/Downloads/www.asstr.org/files/Collections/Kristen's_Collection/Book_Shelf_A/"
output_folder = "/aimodels/scrapehappy/cleaned_storys/"

path = input_folder
all_files = os.listdir(path)
txt_files = list(filter(lambda f: f.endswith('.txt'), all_files))
def remove_strings(lst):
    # Keep only paragraphs that are mostly alphabetic characters.
    result = []
    for element in lst:
        alphabets = sum(c.isalpha() for c in element)
        non_alphabets = sum(not c.isalpha() for c in element)
        if non_alphabets <= alphabets:
            result.append(element)
    return result
def end_trunc(lst):
    # Called on a reversed story: drop everything up to and including the
    # "Kristen's collection" footer. If no footer is found, fall back to
    # filtering out mostly non-alphabetic paragraphs.
    result = []
    for i in range(len(lst)):
        if re.search("Kristen's collection", lst[i].strip()):
            result.extend(lst[i+1:])
            break
    else:
        result = remove_strings(lst)
    return result
def front_trunc(lst):
    # Drop the header block: everything up to and including the first
    # paragraph that matches one of the known disclaimer phrases.
    result = []
    for i in range(len(lst)):
        text = " ".join(lst[i].split())
        if (re.search("BECAUSE I DON'T CENSER MY COLLECTING", text)
                or re.search('FROM THE STORIES YOU READ', text)
                or re.search('This work is copyrighted to the author', text)
                or re.search('This story is copyrighted', text)):
            result.extend(lst[i+1:])
            break
    if len(result) == 0:
        result = front_trunc_emb(lst)
    return result
def front_trunc_emb(result):
    # Fallback: embed the first 10 paragraphs and cut after the one most
    # similar to a typical copyright disclaimer.
    my_list = result[0:10]
    embeddings = model.encode(my_list)
    compare_element = "This work is copyrighted to the author"
    compare_embedding = model.encode([compare_element])
    cosine_similarities = cosine_similarity(compare_embedding, embeddings)[0]
    closest_element_index = np.argmax(cosine_similarities)
    return result[closest_element_index+1:]
def end_trunc_emb(result):
    # Embedding-based fallback for the footer (currently unused): cut the
    # list just after the paragraph most similar to the "keep away from
    # children" disclaimer.
    my_list = result[0:10]
    embeddings = model.encode(my_list)
    compare_element = "Please keep this story,and all erotic stories out of the hand of children"
    compare_embedding = model.encode([compare_element])
    cosine_similarities = cosine_similarity(compare_embedding, embeddings)[0]
    closest_element_index = np.argmax(cosine_similarities)
    return result[:1+closest_element_index]
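
# Illustrative sketch (not part of the pipeline): the two *_emb helpers rank
# candidate paragraphs by cosine similarity to a reference disclaimer, e.g.
#
#   paragraphs = ["STANDARD DISCLAIMER ...", "Once upon a time ..."]
#   sims = cosine_similarity(model.encode(["This work is copyrighted to the author"]),
#                            model.encode(paragraphs))[0]
#   cut = np.argmax(sims)          # index of the paragraph most like the disclaimer
#   story = paragraphs[cut + 1:]   # keep everything after it
#
# The paragraph strings above are made-up examples, not text from the corpus.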
x = 0
for idx, i in enumerate(txt_files):
    with open(path + i, "r", errors='replace') as text:
        raw = text.readlines()
    x = x + 1
    print(i)
    if len(raw) != 0:
        sd = raw
        # Merge consecutive lines into paragraphs, using blank lines as separators.
        sublist = []
        result = []
        for element in sd:
            if element.replace(" ", "") == '\n':
                if sublist:
                    result.append(''.join(sublist))
                    sublist = []
            else:
                sublist.append(element)
        if sublist:
            result.append(''.join(sublist))
        # Collapse internal newlines/whitespace within each paragraph.
        sd = list(map(lambda s: s.strip().replace('\n', ' '), result))
        # Strip the header boilerplate, then reverse and strip the footer.
        sd = front_trunc(sd)
        if len(sd) != 0:
            sd.reverse()
            sd = end_trunc(sd)
            sd.reverse()
        else:
            # Nothing left after truncation: report the file and stop the run.
            print(i)
            break
        new_file_path = output_folder + 'processed' + i
        #with open(new_file_path, "w") as f:
        #    for line in sd:
        #        f.write("%s\n" % line)
        print(sd)
        print(new_file_path)
        print("------------")