Skip to content

Instantly share code, notes, and snippets.

@ArtemisDicoTiar
Created August 9, 2021 23:12
Show Gist options
  • Save ArtemisDicoTiar/fa26f781f3e26cc3c9057f41d326f47e to your computer and use it in GitHub Desktop.
Save ArtemisDicoTiar/fa26f781f3e26cc3c9057f41d326f47e to your computer and use it in GitHub Desktop.
huggingface custom dataset loading script
# coding=utf-8
"""TODO: Add a description here."""
import csv
import json
import os
import datasets
# BibTeX entry for the dataset. Author names are joined with " and ", which is
# the separator BibTeX requires between the people in an author list.
_CITATION = """\
@InProceedings{wisdomify:storyteller,
title = {Korean proverb definitions and examples},
author={Jongyoon Kim and Yubin Kim and Yongtaek Im},
year={2021}
}
"""

# Human-readable description of the dataset.
_DESCRIPTION = """\
This new dataset is designed to provide forward and reverse dictionary of Korean proverbs.
"""

# TODO: Add a link to an official homepage for the dataset here
_HOMEPAGE = ""

# TODO: Add the licence for the dataset here if you can find it
_LICENSE = ""

# Download URL of the zip archive for each builder configuration, keyed by
# configuration name. The HuggingFace datasets library doesn't host the data
# itself; it only points to the original files (see `_split_generators`).
_URLs = {
    'definition': "https://www.dropbox.com/s/4uh564afaimtob3/definition.zip?dl=1",
    'example': "https://www.dropbox.com/s/adlt9n6x5gjs0a6/example.zip?dl=1",
}
class Storyteller(datasets.GeneratorBasedBuilder):
    """Dataset builder for Korean proverbs paired with definitions or examples.

    Two configurations are available:

    * ``"definition"`` (default) -- examples with the features ``wisdom`` and ``def``.
    * ``"example"`` -- examples with the features ``wisdom`` and ``eg``.

    Each configuration downloads its own archive (looked up in ``_URLs`` by
    configuration name) and reads the ``train``/``val``/``test`` TSV files
    found inside it.
    """

    VERSION = datasets.Version("0.0.0")  # version must be in "x.y.z" form

    # The configuration is chosen by name when loading the dataset, e.g. the
    # second positional argument of `datasets.load_dataset`.
    BUILDER_CONFIGS = [
        datasets.BuilderConfig(name="definition", version=VERSION, description="definition"),
        datasets.BuilderConfig(name="example", version=VERSION, description="example"),
    ]

    DEFAULT_CONFIG_NAME = "definition"

    @property
    def _value_column(self):
        """Return the name of the second feature for the active configuration.

        ``"def"`` for the ``"definition"`` configuration, ``"eg"`` otherwise.
        The same token is also embedded in the TSV file names.
        """
        return "def" if self.config.name == "definition" else "eg"

    def _info(self):
        """Return the ``datasets.DatasetInfo`` describing the active configuration."""
        # Both configurations have two string columns; only the name of the
        # second one differs.
        features = datasets.Features(
            {
                "wisdom": datasets.Value("string"),
                self._value_column: datasets.Value("string"),
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            # No (input, target) pair is declared for supervised use.
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download the archive of the active configuration and return its splits.

        Args:
            dl_manager: download manager used to fetch and extract the archive.

        Returns:
            A list with the train, validation and test ``datasets.SplitGenerator``
            objects, each carrying the ``filepath`` and ``split`` keyword
            arguments that are passed on to ``_generate_examples``.
        """
        extracted_dir = dl_manager.download_and_extract(_URLs[self.config.name])
        column = self._value_column

        # (split name, file-name prefix, value of the "split" gen_kwarg)
        split_table = (
            (datasets.Split.TRAIN, "train", "train"),
            (datasets.Split.VALIDATION, "val", "validation"),
            (datasets.Split.TEST, "test", "test"),
        )
        return [
            datasets.SplitGenerator(
                name=split_name,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filepath": os.path.join(extracted_dir, f"{prefix}_wisdom2{column}.tsv"),
                    "split": split_label,
                },
            )
            for split_name, prefix, split_label in split_table
        ]

    def _generate_examples(self, filepath, split):
        """Yield ``(key, example)`` tuples read from one TSV file.

        Args:
            filepath: path of the tab-separated file to read (UTF-8).
            split: split label supplied through ``gen_kwargs``; accepted for
                interface compatibility and not used here.

        Yields:
            ``(key, example)`` where ``key`` is the row's position in the file
            (the header row being position 0) and ``example`` maps ``"wisdom"``
            to the first column and ``"def"``/``"eg"`` to the second column.
        """
        column = self._value_column
        # newline="" lets the csv module do its own line-ending handling, as
        # the csv documentation requires for file objects.
        with open(filepath, encoding="utf-8", newline="") as f:
            tsv_reader = csv.reader(f, delimiter="\t")
            next(tsv_reader, None)  # first row shows column info
            for id_, row in enumerate(tsv_reader, start=1):
                if not row:
                    # csv.reader yields [] for a blank line; indexing it would
                    # raise IndexError, so blank lines are skipped.
                    continue
                yield id_, {
                    "wisdom": row[0],
                    column: row[1],
                }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment