Haritz Puerto HaritzPuerto

## pyLDAvisException.py
# First, retrieve the documents: read every entry of the test directory as
# UTF-8 text and collect the contents.

setDocs1 = []  # contents of the files found in Test/1
allDocuments = []  # receives the same contents as setDocs1 in this loop
for file_name in os.listdir("/home/vagrant/shared/Test/1"):
    # The context manager closes each handle as soon as it has been read;
    # previously every opened file stayed open (one leaked descriptor per
    # directory entry).
    with codecs.open("/home/vagrant/shared/Test/1/" + file_name, "r", "utf-8") as file:
        aux = file.read()
    setDocs1.append(aux)
    allDocuments.append(aux)

## confusion_matrix_between_two_pytorch_tensors.py
import torch


def confusion(prediction, truth):
    """ Returns the confusion matrix for the values in the `prediction` and `truth`
    tensors, i.e. the amount of positions where the values of `prediction`
    and `truth` are
    - 1 and 1 (True Positive)
    - 1 and 0 (False Positive)
    - 0 and 0 (True Negative)

## process_the_pile_partition.py
import zstandard as zstd
import json
from tqdm.notebook import tqdm
from datasets import load_dataset

pile_path = "the_pile/train/00.jsonl.zst"

arxiv = []
with zstd.open(pile_path, 'r') as f:
    for i, line in enumerate(tqdm(f)):
	#First, retrieve documents

	setDocs1 = []
	allDocuments = []
	for file_name in os.listdir("/home/vagrant/shared/Test/1"):
	file = codecs.open("/home/vagrant/shared/Test/1/" + file_name, "r", "utf-8")
	aux = file.read()
	setDocs1.append(aux)
	allDocuments.append(aux)
	import torch


	def confusion(prediction, truth):
	""" Returns the confusion matrix for the values in the `prediction` and `truth`
	tensors, i.e. the amount of positions where the values of `prediction`
	and `truth` are
	- 1 and 1 (True Positive)
	- 1 and 0 (False Positive)
	- 0 and 0 (True Negative)
	import zstandard as zstd
	import json
	from tqdm.notebook import tqdm
	from datasets import load_dataset

	pile_path = "the_pile/train/00.jsonl.zst"

	arxiv = []
	with zstd.open(pile_path, 'r') as f:
	for i, line in enumerate(tqdm(f)):