Created
August 9, 2021 23:12
-
-
Save ArtemisDicoTiar/fa26f781f3e26cc3c9057f41d326f47e to your computer and use it in GitHub Desktop.
huggingface custom dataset loading script
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding=utf-8
"""HuggingFace `datasets` loading script for Korean proverb definitions and examples."""
import csv
import json
import os

import datasets
# BibTeX entry reported as the dataset citation.
# BibTeX requires the names in `author` to be separated by "and".
_CITATION = """\
@InProceedings{wisdomify:storyteller,
title = {Korean proverb definitions and examples},
author = {Jongyoon Kim and Yubin Kim and Yongtaek Im},
year = {2021}
}
"""

# Human-readable description reported as the dataset description.
_DESCRIPTION = """\
This new dataset is designed to provide forward and reverse dictionary of Korean proverbs.
"""

# TODO: Add a link to an official homepage for the dataset here
_HOMEPAGE = ""

# TODO: Add the licence for the dataset here if you can find it
_LICENSE = ""

# The HuggingFace datasets library does not host the data itself; it only
# points to the original files. Keys are configuration names and values are
# the archives downloaded for that configuration (`?dl=1` asks Dropbox for a
# direct download instead of the preview page).
_URLs = {
    "definition": "https://www.dropbox.com/s/4uh564afaimtob3/definition.zip?dl=1",
    "example": "https://www.dropbox.com/s/adlt9n6x5gjs0a6/example.zip?dl=1",
}
class Storyteller(datasets.GeneratorBasedBuilder):
    """Dataset builder for Korean proverbs paired with a definition or an example.

    Two configurations are available, selected by name:

    * ``"definition"`` (the default) -- records with ``wisdom`` and ``def`` fields.
    * ``"example"`` -- records with ``wisdom`` and ``eg`` fields.

    Each configuration downloads its own archive (looked up in ``_URLs``) and
    exposes train, validation and test splits read from tab-separated files.
    """

    VERSION = datasets.Version("0.0.0")  # version must be in "x.y.z" form

    # Load a specific configuration with, for example:
    #   datasets.load_dataset(<path to this script>, "definition")
    #   datasets.load_dataset(<path to this script>, "example")
    BUILDER_CONFIGS = [
        datasets.BuilderConfig(name="definition", version=VERSION, description="definition"),
        datasets.BuilderConfig(name="example", version=VERSION, description="example"),
    ]

    DEFAULT_CONFIG_NAME = "definition"

    def _paired_column(self):
        """Return the name of the field paired with ``wisdom`` for the active config.

        ``"def"`` for the ``"definition"`` configuration, ``"eg"`` for any other.
        The same token is used in the data file names (``*_wisdom2def.tsv`` /
        ``*_wisdom2eg.tsv``).
        """
        return "def" if self.config.name == "definition" else "eg"

    def _info(self):
        """Return the ``datasets.DatasetInfo`` describing the active configuration.

        Both configurations have two string features: ``wisdom`` plus the
        configuration-specific field returned by ``_paired_column``.
        """
        features = datasets.Features(
            {
                "wisdom": datasets.Value("string"),
                self._paired_column(): datasets.Value("string"),
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            # No common (input, target) pair is declared for this dataset.
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download the archive for the active config and define its splits.

        Args:
            dl_manager: download manager supplied by ``datasets``; its
                ``download_and_extract`` result is used as the directory that
                contains the extracted TSV files.

        Returns:
            A list of three ``datasets.SplitGenerator`` objects (train,
            validation, test). Their ``gen_kwargs`` are passed to
            ``_generate_examples``.
        """
        data_dir = dl_manager.download_and_extract(_URLs[self.config.name])
        column = self._paired_column()

        # (split identifier, file-name prefix, value passed as the `split` kwarg)
        split_specs = (
            (datasets.Split.TRAIN, "train", "train"),
            (datasets.Split.VALIDATION, "val", "validation"),
            (datasets.Split.TEST, "test", "test"),
        )
        return [
            datasets.SplitGenerator(
                name=split_name,
                gen_kwargs={
                    "filepath": os.path.join(data_dir, f"{file_prefix}_wisdom2{column}.tsv"),
                    "split": split_label,
                },
            )
            for split_name, file_prefix, split_label in split_specs
        ]

    def _generate_examples(self, filepath, split):
        """Yield ``(key, example)`` tuples read from one tab-separated file.

        Args:
            filepath: path of the TSV file to read. Its first row is treated
                as a header and skipped.
            split: split label supplied through ``gen_kwargs``; not used here,
                kept so the signature matches the ``gen_kwargs`` keys.

        Yields:
            ``(key, example)`` where ``key`` is the 1-based position of the
            row after the header and ``example`` maps ``"wisdom"`` to the
            first field and ``"def"``/``"eg"`` to the second.

        Raises:
            IndexError: if a non-blank row has fewer than two fields.
        """
        column = self._paired_column()
        # newline="" lets the csv module handle line endings itself, as its
        # documentation requires for files passed to csv.reader.
        with open(filepath, encoding="utf-8", newline="") as f:
            tsv_reader = csv.reader(f, delimiter="\t")
            next(tsv_reader, None)  # first row shows column info
            for id_, row in enumerate(tsv_reader, start=1):
                if not row:
                    # csv.reader yields [] for blank lines (e.g. trailing newlines).
                    continue
                yield id_, {
                    "wisdom": row[0],
                    column: row[1],
                }
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment