ArtemisDicoTiar/study.py

## study.py
# coding=utf-8
"""HuggingFace `datasets` loading script for the Storyteller dataset of Korean proverbs."""


import csv
import json
import os

import datasets


# BibTeX entry to cite for this dataset.
# NOTE: BibTeX requires author names to be separated by " and "; a
# comma-separated list is parsed as a single name.
_CITATION = """\
@InProceedings{wisdomify:storyteller,
title = {Korean proverb definitions and examples},
author={Jongyoon Kim and Yubin Kim and Yongtaek Im},
year={2021}
}
"""

# Human-readable summary of the dataset.
_DESCRIPTION = """\
This new dataset is designed to provide forward and reverse dictionary of Korean proverbs.
"""

# TODO: Add a link to an official homepage for the dataset here
_HOMEPAGE = ""

# TODO: Add the licence for the dataset here if you can find it
_LICENSE = ""

# The HuggingFace datasets library doesn't host the data itself; it only points to the original files.
# Each configuration name maps to the zip archive that `_split_generators` downloads and extracts.
_URLs = {
    'definition': "https://www.dropbox.com/s/4uh564afaimtob3/definition.zip?dl=1",
    'example': "https://www.dropbox.com/s/adlt9n6x5gjs0a6/example.zip?dl=1",
}


class Storyteller(datasets.GeneratorBasedBuilder):
    """Dataset builder for Korean proverbs ("wisdoms") and their descriptions.

    Two configurations are offered, each producing two string columns:

    * ``definition`` (default): ``wisdom`` and ``def``.
    * ``example``: ``wisdom`` and ``eg``.

    Every configuration exposes train, validation and test splits, each read
    from a tab-separated file inside the archive listed in ``_URLs``.
    """

    VERSION = datasets.Version("0.0.0")  # version must be in "x.y.z" form

    # The configuration is chosen by name when loading, e.g.
    #   datasets.load_dataset(<path to this script>, "definition")
    #   datasets.load_dataset(<path to this script>, "example")
    BUILDER_CONFIGS = [
        datasets.BuilderConfig(name="definition", version=VERSION, description="definition"),
        datasets.BuilderConfig(name="example", version=VERSION, description="example"),
    ]

    DEFAULT_CONFIG_NAME = "definition"

    def _value_column(self):
        """Return the name of the second column for the active configuration.

        ``"def"`` for the ``definition`` configuration and ``"eg"`` for any
        other. The same token names the feature and appears in the TSV file
        names, so it is derived in exactly one place.
        """
        return "def" if self.config.name == "definition" else "eg"

    def _info(self):
        """Return the ``datasets.DatasetInfo`` describing the active configuration."""
        # Both configurations share the "wisdom" column; only the name of the
        # second string column differs.
        features = datasets.Features(
            {
                "wisdom": datasets.Value("string"),
                self._value_column(): datasets.Value("string"),
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            # No canonical (input, target) pair is declared.
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download the archive of the active configuration and declare its splits.

        Args:
            dl_manager: Download manager supplied by ``datasets``; its
                ``download_and_extract`` is given the configuration's URL and
                the result is used as the directory holding the TSV files.

        Returns:
            A list of three ``datasets.SplitGenerator`` objects (train,
            validation, test). Their ``gen_kwargs`` are passed on to
            ``_generate_examples``.
        """
        data_dir = dl_manager.download_and_extract(_URLs[self.config.name])
        column = self._value_column()

        # (split, file-name prefix, label forwarded as the ``split`` kwarg)
        layout = (
            (datasets.Split.TRAIN, "train", "train"),
            (datasets.Split.VALIDATION, "val", "validation"),
            (datasets.Split.TEST, "test", "test"),
        )
        return [
            datasets.SplitGenerator(
                name=split_name,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filepath": os.path.join(data_dir, f"{prefix}_wisdom2{column}.tsv"),
                    "split": label,
                },
            )
            for split_name, prefix, label in layout
        ]

    def _generate_examples(
        self, filepath, split  # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
    ):
        """Yield ``(key, example)`` tuples read from one tab-separated file.

        Args:
            filepath: Path of the TSV file to read. Its first record is a
                header and is skipped.
            split: Label of the split; not used here, but it is part of the
                ``gen_kwargs`` built in ``_split_generators``.

        Yields:
            ``(key, example)`` where ``key`` is the index of the record in the
            file (the header being record 0) and ``example`` maps ``"wisdom"``
            and the configuration's second column to the first two fields.

        Raises:
            IndexError: If a non-blank record has fewer than two fields.
        """
        column = self._value_column()

        # newline="" is what the csv module documentation asks for: it lets
        # the reader handle line endings itself, including any that are
        # embedded inside quoted fields.
        with open(filepath, encoding="utf-8", newline="") as f:
            reader = csv.reader(f, delimiter="\t")
            next(reader, None)  # first row shows column info
            for id_, row in enumerate(reader, start=1):
                if not row:
                    # csv.reader yields an empty list for a blank line (for
                    # instance a trailing one); there is nothing to emit.
                    continue
                yield id_, {
                    "wisdom": row[0],
                    column: row[1],
                }
	# coding=utf-8
	"""HuggingFace `datasets` loading script for the Storyteller dataset of Korean proverbs."""


	import csv
	import json
	import os

	import datasets


	# BibTeX entry to cite for this dataset.
	# NOTE: BibTeX requires author names to be separated by " and "; a
	# comma-separated list is parsed as a single name.
	_CITATION = """\
	@InProceedings{wisdomify:storyteller,
	title = {Korean proverb definitions and examples},
	author={Jongyoon Kim and Yubin Kim and Yongtaek Im},
	year={2021}
	}
	"""

	# Human-readable summary of the dataset.
	_DESCRIPTION = """\
	This new dataset is designed to provide forward and reverse dictionary of Korean proverbs.
	"""

	# TODO: Add a link to an official homepage for the dataset here
	_HOMEPAGE = ""

	# TODO: Add the licence for the dataset here if you can find it
	_LICENSE = ""

	# The HuggingFace datasets library doesn't host the data itself; it only points to the original files.
	# Each configuration name maps to the zip archive that `_split_generators` downloads and extracts.
	_URLs = {
	    'definition': "https://www.dropbox.com/s/4uh564afaimtob3/definition.zip?dl=1",
	    'example': "https://www.dropbox.com/s/adlt9n6x5gjs0a6/example.zip?dl=1",
	}


	class Storyteller(datasets.GeneratorBasedBuilder):
	    """Dataset builder for Korean proverbs ("wisdoms") and their descriptions.

	    Two configurations are offered, each producing two string columns:

	    * ``definition`` (default): ``wisdom`` and ``def``.
	    * ``example``: ``wisdom`` and ``eg``.

	    Every configuration exposes train, validation and test splits, each read
	    from a tab-separated file inside the archive listed in ``_URLs``.
	    """

	    VERSION = datasets.Version("0.0.0")  # version must be in "x.y.z" form

	    # The configuration is chosen by name when loading, e.g.
	    #   datasets.load_dataset(<path to this script>, "definition")
	    #   datasets.load_dataset(<path to this script>, "example")
	    BUILDER_CONFIGS = [
	        datasets.BuilderConfig(name="definition", version=VERSION, description="definition"),
	        datasets.BuilderConfig(name="example", version=VERSION, description="example"),
	    ]

	    DEFAULT_CONFIG_NAME = "definition"

	    def _value_column(self):
	        """Return the name of the second column for the active configuration.

	        ``"def"`` for the ``definition`` configuration and ``"eg"`` for any
	        other. The same token names the feature and appears in the TSV file
	        names, so it is derived in exactly one place.
	        """
	        return "def" if self.config.name == "definition" else "eg"

	    def _info(self):
	        """Return the ``datasets.DatasetInfo`` describing the active configuration."""
	        # Both configurations share the "wisdom" column; only the name of the
	        # second string column differs.
	        features = datasets.Features(
	            {
	                "wisdom": datasets.Value("string"),
	                self._value_column(): datasets.Value("string"),
	            }
	        )
	        return datasets.DatasetInfo(
	            description=_DESCRIPTION,
	            features=features,
	            # No canonical (input, target) pair is declared.
	            supervised_keys=None,
	            homepage=_HOMEPAGE,
	            license=_LICENSE,
	            citation=_CITATION,
	        )

	    def _split_generators(self, dl_manager):
	        """Download the archive of the active configuration and declare its splits.

	        Args:
	            dl_manager: Download manager supplied by ``datasets``; its
	                ``download_and_extract`` is given the configuration's URL and
	                the result is used as the directory holding the TSV files.

	        Returns:
	            A list of three ``datasets.SplitGenerator`` objects (train,
	            validation, test). Their ``gen_kwargs`` are passed on to
	            ``_generate_examples``.
	        """
	        data_dir = dl_manager.download_and_extract(_URLs[self.config.name])
	        column = self._value_column()

	        # (split, file-name prefix, label forwarded as the ``split`` kwarg)
	        layout = (
	            (datasets.Split.TRAIN, "train", "train"),
	            (datasets.Split.VALIDATION, "val", "validation"),
	            (datasets.Split.TEST, "test", "test"),
	        )
	        return [
	            datasets.SplitGenerator(
	                name=split_name,
	                # These kwargs will be passed to _generate_examples
	                gen_kwargs={
	                    "filepath": os.path.join(data_dir, f"{prefix}_wisdom2{column}.tsv"),
	                    "split": label,
	                },
	            )
	            for split_name, prefix, label in layout
	        ]

	    def _generate_examples(
	        self, filepath, split  # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
	    ):
	        """Yield ``(key, example)`` tuples read from one tab-separated file.

	        Args:
	            filepath: Path of the TSV file to read. Its first record is a
	                header and is skipped.
	            split: Label of the split; not used here, but it is part of the
	                ``gen_kwargs`` built in ``_split_generators``.

	        Yields:
	            ``(key, example)`` where ``key`` is the index of the record in the
	            file (the header being record 0) and ``example`` maps ``"wisdom"``
	            and the configuration's second column to the first two fields.

	        Raises:
	            IndexError: If a non-blank record has fewer than two fields.
	        """
	        column = self._value_column()

	        # newline="" is what the csv module documentation asks for: it lets
	        # the reader handle line endings itself, including any that are
	        # embedded inside quoted fields.
	        with open(filepath, encoding="utf-8", newline="") as f:
	            reader = csv.reader(f, delimiter="\t")
	            next(reader, None)  # first row shows column info
	            for id_, row in enumerate(reader, start=1):
	                if not row:
	                    # csv.reader yields an empty list for a blank line (for
	                    # instance a trailing one); there is nothing to emit.
	                    continue
	                yield id_, {
	                    "wisdom": row[0],
	                    column: row[1],
	                }