This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# First, retrieve the documents: read every entry of the directory as UTF-8
# text and append its contents to both setDocs1 and allDocuments.
docs_dir = "/home/vagrant/shared/Test/1"
setDocs1 = []
allDocuments = []
for file_name in os.listdir(docs_dir):
    # Close each handle as soon as its contents have been read, so a large
    # directory does not accumulate open file descriptors.
    with codecs.open(os.path.join(docs_dir, file_name), "r", "utf-8") as file:
        aux = file.read()
    setDocs1.append(aux)
    allDocuments.append(aux)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import torch | |
def confusion(prediction, truth): | |
""" Returns the confusion matrix for the values in the `prediction` and `truth` | |
tensors, i.e. the amount of positions where the values of `prediction` | |
and `truth` are | |
- 1 and 1 (True Positive) | |
- 1 and 0 (False Positive) | |
- 0 and 0 (True Negative) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import zstandard as zstd | |
import json | |
from tqdm.notebook import tqdm | |
from datasets import load_dataset | |
pile_path = "the_pile/train/00.jsonl.zst" | |
arxiv = [] | |
with zstd.open(pile_path, 'r') as f: | |
for i, line in enumerate(tqdm(f)): |