Skip to content

Instantly share code, notes, and snippets.

@HaritzPuerto
Created July 16, 2024 15:08
Show Gist options
  • Save HaritzPuerto/3543a70b5bfe50c8fec6977c716efe67 to your computer and use it in GitHub Desktop.
Save HaritzPuerto/3543a70b5bfe50c8fec6977c716efe67 to your computer and use it in GitHub Desktop.
import zstandard as zstd
import json
from tqdm.notebook import tqdm
from datasets import load_dataset
# Extract the ArXiv documents from one shard of The Pile, save them as JSON
# Lines, and publish the resulting dataset to the Hugging Face Hub.
pile_path = "the_pile/train/00.jsonl.zst"

# Each non-empty line of the decompressed shard is parsed as one JSON document;
# keep those whose metadata tags them as "ArXiv". The text encoding is given
# explicitly so decoding does not depend on the platform's locale default.
arxiv = []
with zstd.open(pile_path, "rt", encoding="utf-8") as f:
    for line in tqdm(f):
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        doc = json.loads(line)
        if doc["meta"]["pile_set_name"] == "ArXiv":
            arxiv.append(doc)

# Write the selected documents back out, one JSON object per line.
with open("/tmp/arxiv.jsonl", "w", encoding="utf-8") as f:
    for entry in arxiv:
        f.write(json.dumps(entry) + "\n")

# NOTE(review): only the train file is written above. The validation and test
# files are not produced by this code; presumably they were created the same
# way from the corresponding Pile shards and must already exist -- confirm.
pile_arxiv = load_dataset(
    "json",
    data_files={
        'train': "/tmp/arxiv.jsonl",
        'validation': "/tmp/arxiv_val.jsonl",
        'test': "/tmp/arxiv_test.jsonl",
    },
)
pile_arxiv.push_to_hub("haritzpuerto/the_pile_arxiv")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment