Created
October 28, 2021 05:01
-
-
Save Mageswaran1989/1ffadeb9a770eee7d8b8010a7163cafb to your computer and use it in GitHub Desktop.
sroie2019_dataset.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class SROIE2019(datasets.GeneratorBasedBuilder):
    """SROIE2019 receipt-NER dataset builder for HuggingFace `datasets`.

    Loads CoNLL-style text files (one ``token tag`` pair per line, examples
    separated by blank lines) from the mozhi-datasets GitHub repository and
    exposes them as train/validation/test splits with ``id``, ``tokens`` and
    ``ner_tags`` features.
    """

    BUILDER_CONFIGS = [
        SROIE2019Config(name="SROIE2019", version=datasets.Version("1.0.0"),
                        description="SROIE2019 dataset"),
    ]

    def __init__(self,
                 *args,
                 cache_dir,
                 url="https://raw.githubusercontent.com/gyan42/mozhi-datasets/main/sroie2019/version1/",
                 train_file="train.txt",
                 val_file="valid.txt",
                 test_file="test.txt",
                 ner_tags=("company", "date", "address", "total", "O"),
                 **kwargs):
        """Configure file locations and the NER tag set.

        Args:
            cache_dir: Directory where `datasets` caches downloads (required,
                keyword-only by position after ``*args``).
            url: Base URL the split files are downloaded from.
            train_file: File name of the training split, relative to ``url``.
            val_file: File name of the validation split, relative to ``url``.
            test_file: File name of the test split, relative to ``url``.
            ner_tags: Closed set of entity labels used to build the
                ``ClassLabel`` feature. A tuple (immutable) so the default is
                safe to share across instances.
        """
        self._ner_tags = ner_tags
        self._url = url
        self._train_file = train_file
        self._val_file = val_file
        self._test_file = test_file
        super(SROIE2019, self).__init__(*args, cache_dir=cache_dir, **kwargs)

    def _info(self):
        """Return the dataset metadata (features schema, description, citation)."""
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "id": datasets.Value("string"),
                    "tokens": datasets.Sequence(datasets.Value("string")),
                    "ner_tags": datasets.Sequence(
                        datasets.features.ClassLabel(
                            # Sorted so label -> integer-id mapping is
                            # deterministic regardless of the tuple's order.
                            names=sorted(list(self._ner_tags))
                        )
                    ),
                }
            ),
            supervised_keys=None,
            homepage="",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download the three split files and return their SplitGenerators."""
        urls_to_download = {
            "train": f"{self._url}{self._train_file}",
            "dev": f"{self._url}{self._val_file}",
            "test": f"{self._url}{self._test_file}",
        }
        downloaded_files = dl_manager.download_and_extract(urls_to_download)
        return [
            datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": downloaded_files["train"]}),
            datasets.SplitGenerator(name=datasets.Split.VALIDATION, gen_kwargs={"filepath": downloaded_files["dev"]}),
            datasets.SplitGenerator(name=datasets.Split.TEST, gen_kwargs={"filepath": downloaded_files["test"]}),
        ]

    def _generate_examples(self, filepath):
        """Yield ``(guid, example)`` pairs parsed from a CoNLL-style file.

        Each non-blank line is ``"<token> <tag>"``; blank lines delimit
        examples. Yields dicts with ``id``, ``tokens`` and ``ner_tags``.
        """
        logger.info("⏳ Generating examples from = %s", filepath)
        with open(filepath, encoding="utf-8") as f:
            guid = 0
            tokens = []
            ner_tags = []
            for line in f:
                # Treat any whitespace-only line as an example separator.
                # (The previous exact-match check let lines of spaces fall
                # through to the split below and raise IndexError.)
                if not line.strip():
                    if tokens:
                        yield guid, {
                            "id": str(guid),
                            "tokens": tokens,
                            "ner_tags": ner_tags,
                        }
                        guid += 1
                        tokens = []
                        ner_tags = []
                else:
                    # SROIE2019 tokens are space separated: "<token> <tag>"
                    splits = line.split(" ")
                    tokens.append(splits[0])
                    ner_tags.append(splits[1].rstrip())
            # Flush the last example. Guarded so a file ending in a blank
            # line does not emit a spurious empty example (the original
            # yielded unconditionally here).
            if tokens:
                yield guid, {
                    "id": str(guid),
                    "tokens": tokens,
                    "ner_tags": ner_tags,
                }
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment