Last active
December 1, 2022 07:09
-
-
Save saattrupdan/1cb8379232fdec6e943dc84595a85e7c to your computer and use it in GitHub Desktop.
The DanFEVER dataset (https://huggingface.co/datasets/strombergnlp/danfever) only comes with a training split, making evaluations on it non-reproducible. This gist creates validation and test splits in a deterministic fashion.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from datasets import Dataset, DatasetDict, load_dataset

# Load the DanFEVER dataset; it ships with only a single "train" split
dataset = load_dataset("strombergnlp/danfever", split="train")

# Convert the dataset to a Pandas DataFrame for easier group-level manipulation
df = dataset.to_pandas()

# Count samples per unique `evidence_extract` value. Splitting at the level of
# evidence extracts (rather than individual samples) keeps all claims about the
# same passage in the same split, avoiding leakage between splits.
evidence_extract_counts = df.evidence_extract.value_counts()

# Pick the evidence extracts for the test split: the maximal prefix (in
# descending-count order) whose cumulative sample count stays below 1,000
test_evidence_extract = evidence_extract_counts[
    evidence_extract_counts.cumsum() < 1000
].index.tolist()

# Pick the evidence extracts for the validation split: roughly the next 500
# samples' worth, i.e. cumulative count below 1,500 and not already assigned
# to the test split
val_evidence_extract = evidence_extract_counts[
    (evidence_extract_counts.cumsum() < 1500)
    & (~evidence_extract_counts.index.isin(test_evidence_extract))
].index.tolist()

# The remaining evidence extracts form the train split
train_evidence_extract = evidence_extract_counts[
    ~evidence_extract_counts.index.isin(
        test_evidence_extract + val_evidence_extract
    )
].index.tolist()

# Convert the filtered dataframes back to Hugging Face datasets, dropping the
# Pandas index so it does not become a column in the resulting dataset
train_dataset = Dataset.from_pandas(
    df[df.evidence_extract.isin(train_evidence_extract)],
    preserve_index=False,
)
val_dataset = Dataset.from_pandas(
    df[df.evidence_extract.isin(val_evidence_extract)],
    preserve_index=False,
)
test_dataset = Dataset.from_pandas(
    df[df.evidence_extract.isin(test_evidence_extract)],
    preserve_index=False,
)

# Package the three splits into a single DatasetDict
dataset_dict = DatasetDict(
    dict(train=train_dataset, val=val_dataset, test=test_dataset)
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment