Skip to content

Instantly share code, notes, and snippets.

@Mageswaran1989
Created October 28, 2021 05:01
Show Gist options
  • Save Mageswaran1989/1ffadeb9a770eee7d8b8010a7163cafb to your computer and use it in GitHub Desktop.
Save Mageswaran1989/1ffadeb9a770eee7d8b8010a7163cafb to your computer and use it in GitHub Desktop.
sroie2019_dataset.py
class SROIE2019(datasets.GeneratorBasedBuilder):
    """Dataset builder for the SROIE2019 token-classification data.

    Each example has three fields (see ``_info``):

    * ``id``       -- running example index, as a string
    * ``tokens``   -- list of token strings
    * ``ner_tags`` -- list of class labels, one per token

    The train / validation / test files are fetched from ``url`` and are
    expected to hold one ``<token> <tag>`` pair per line, with a blank line
    separating consecutive examples.
    """

    BUILDER_CONFIGS = [
        SROIE2019Config(
            name="SROIE2019",
            version=datasets.Version("1.0.0"),
            description="SROIE2019 dataset",
        ),
    ]

    def __init__(
        self,
        *args,
        cache_dir,
        url="https://raw.githubusercontent.com/gyan42/mozhi-datasets/main/sroie2019/version1/",
        train_file="train.txt",
        val_file="valid.txt",
        test_file="test.txt",
        ner_tags=("company", "date", "address", "total", "O"),
        **kwargs,
    ):
        """Store the data location and label set, then initialise the builder.

        Args:
            *args: Positional arguments forwarded to the base builder.
            cache_dir: Cache directory forwarded to the base builder
                (required, keyword-only).
            url: Prefix the file names are appended to. It is concatenated
                directly with the file name, so it must end with ``/``.
            train_file: File name of the training split.
            val_file: File name of the validation split.
            test_file: File name of the test split.
            ner_tags: Iterable of label names used for the ``ner_tags``
                feature.
            **kwargs: Keyword arguments forwarded to the base builder.
        """
        # NOTE: these attributes are assigned before calling the base
        # constructor on purpose -- the `datasets` builder base class is
        # expected to call `_info()` while initialising, and `_info()` reads
        # `self._ner_tags`.
        self._ner_tags = ner_tags
        self._url = url
        self._train_file = train_file
        self._val_file = val_file
        self._test_file = test_file
        super().__init__(*args, cache_dir=cache_dir, **kwargs)

    def _info(self):
        """Return the ``DatasetInfo`` describing the features of an example."""
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "id": datasets.Value("string"),
                    "tokens": datasets.Sequence(datasets.Value("string")),
                    "ner_tags": datasets.Sequence(
                        datasets.features.ClassLabel(
                            # Sorted so that the label ids do not depend on
                            # the order in which the tags were passed in.
                            names=sorted(self._ner_tags)
                        )
                    ),
                }
            ),
            supervised_keys=None,
            homepage="",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download the three split files and return their SplitGenerators.

        Args:
            dl_manager: Download manager; its ``download_and_extract`` is
                called with a ``{"train", "dev", "test"}`` mapping of URLs.

        Returns:
            A list with the TRAIN, VALIDATION and TEST split generators, each
            carrying the local path of its file as the ``filepath`` kwarg.
        """
        urls_to_download = {
            "train": f"{self._url}{self._train_file}",
            "dev": f"{self._url}{self._val_file}",
            "test": f"{self._url}{self._test_file}",
        }
        downloaded_files = dl_manager.download_and_extract(urls_to_download)

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"filepath": downloaded_files["train"]},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={"filepath": downloaded_files["dev"]},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={"filepath": downloaded_files["test"]},
            ),
        ]

    def _generate_examples(self, filepath):
        """Yield ``(key, example)`` pairs parsed from one split file.

        Lines are read as ``<token> <tag>`` pairs separated by a single
        space; a blank (or whitespace-only) line ends the current example.

        Args:
            filepath: Path of the UTF-8 text file to read.

        Yields:
            Tuples ``(guid, {"id": str(guid), "tokens": [...],
            "ner_tags": [...]})`` where ``guid`` counts examples from 0.
        """
        logger.info("⏳ Generating examples from = %s", filepath)
        with open(filepath, encoding="utf-8") as f:
            guid = 0
            tokens = []
            ner_tags = []
            for line in f:
                # A whitespace-only line is an example separator as well; it
                # carries no token/tag pair and must not be parsed as one.
                if not line.strip():
                    # Consecutive separators must not emit empty examples.
                    if tokens:
                        yield guid, {
                            "id": str(guid),
                            "tokens": tokens,
                            "ner_tags": ner_tags,
                        }
                        guid += 1
                        tokens = []
                        ner_tags = []
                else:
                    # SROIE2019 tokens are space separated
                    splits = line.split(" ")
                    tokens.append(splits[0])
                    # The tag is the last thing on the line, so it still
                    # carries the trailing newline.
                    ner_tags.append(splits[1].rstrip())
            # Last example: only present when the file does not end with a
            # separator line. Guarded so that an empty file, or one ending
            # with a blank line, does not produce an empty example.
            if tokens:
                yield guid, {
                    "id": str(guid),
                    "tokens": tokens,
                    "ner_tags": ner_tags,
                }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment