Skip to content

Instantly share code, notes, and snippets.

@jxmorris12
Created October 25, 2023 17:49
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jxmorris12/aadf73a2759b08d781f90e4f62c0eec0 to your computer and use it in GitHub Desktop.
Save jxmorris12/aadf73a2759b08d781f90e4f62c0eec0 to your computer and use it in GitHub Desktop.
Load a dataset from JSONL files and upload it to the Hugging Face Hub
import argparse
import glob
import os

import datasets
import pandas as pd
def _find_split_file(data_folder, pattern):
    """Return the first file (in sorted order) in `data_folder` matching `pattern`.

    Args:
        data_folder: Folder to search; taken literally, not as a glob pattern.
        pattern: Glob pattern for the file name, e.g. ``"train*.jsonl"``.

    Raises:
        FileNotFoundError: If no file in the folder matches the pattern.
    """
    # Escape the folder part so characters such as `[` in the path are not
    # interpreted as glob syntax; only `pattern` should act as a wildcard.
    matches = sorted(glob.glob(f"{glob.escape(data_folder)}/{pattern}"))
    if not matches:
        raise FileNotFoundError(
            f"no file matching `{pattern}` found in `{data_folder}`"
        )
    # glob returns results in arbitrary order; sorting makes the choice
    # deterministic when several files match.
    return matches[0]


def load_datasets(data_folder: str) -> "datasets.DatasetDict":
    """Load the train/test/dev JSONL files of a folder into a `DatasetDict`.

    The train and dev files are located with the patterns ``train*.jsonl`` and
    ``dev*.jsonl``; the test file must be named exactly ``test.jsonl``. Every
    column that exists in the train split but not in the test split is added
    to the test split, filled with the string ``"-1"``.

    Args:
        data_folder: Path to the folder containing the JSONL files.

    Returns:
        A `datasets.DatasetDict` with the keys ``"train"``, ``"test"`` and
        ``"dev"``.

    Raises:
        FileNotFoundError: If no train or dev file matches its pattern.
    """
    # Resolve both globbed files up front so a missing split is reported
    # before any file is parsed.
    train_file = _find_split_file(data_folder, "train*.jsonl")
    test_file = f"{data_folder}/test.jsonl"
    dev_file = _find_split_file(data_folder, "dev*.jsonl")

    train_df = pd.read_json(train_file, lines=True)
    test_df = pd.read_json(test_file, lines=True)
    dev_df = pd.read_json(dev_file, lines=True)

    train_ds = datasets.Dataset.from_pandas(train_df)
    test_ds = datasets.Dataset.from_pandas(test_df)
    dev_ds = datasets.Dataset.from_pandas(dev_df)

    # Give the test split every column the train split has, using "-1" as a
    # placeholder value for the rows of the missing columns.
    for column in train_ds.column_names:
        if column not in test_ds.column_names:
            test_ds = test_ds.add_column(column, ["-1"] * len(test_ds))
            print(f"added column `{column}` to test dataset.")

    return datasets.DatasetDict({
        "train": train_ds,
        "test": test_ds,
        "dev": dev_ds,
    })
def main():
    """Parse the command line, load the dataset folder and push it to the Hub.

    Takes one positional argument, the path to a folder of JSONL files. The
    loaded splits are pushed to the repository ``jxm/<folder name>``, where
    the folder name is the final component of the given path.
    """
    parser = argparse.ArgumentParser(description="Load datasets from JSONL files.")
    parser.add_argument("dataset", type=str, help="Path to the folder containing JSONL files.")
    args = parser.parse_args()

    # The argument is a path, so it may contain directories, a trailing slash
    # or `.`; only its final component is usable as a repository name.
    # A plain folder name maps to itself.
    repo_name = os.path.basename(os.path.abspath(args.dataset))
    if not repo_name:
        parser.error(f"cannot derive a dataset name from `{args.dataset}`")

    dataset_dict = load_datasets(args.dataset)
    dataset_dict.push_to_hub(f"jxm/{repo_name}")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment