Created
November 6, 2023 08:47
-
-
Save andreaskoepf/e2fdfade9cd95d8aff09bdce594a3db3 to your computer and use it in GitHub Desktop.
OASST1 Huggingface compatible dataset generation scripts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Smoke-test the local OASST1 loading script, then publish the built dataset
# to the Hugging Face Hub.
from datasets import load_dataset

# Build the "ready" configuration from the local dataset-script directory.
ds = load_dataset("/path/oasst1", name='ready')
train = ds['train']
val = ds['validation']

print(f'{len(train)=}')
print(f'{len(val)=}')

# Preview a few tree ids. The range is bounded by the split size so that a
# train split with fewer than five rows does not raise IndexError.
for i in range(min(5, len(train))):
    print(train[i]["message_tree_id"])

# "[name]" and the token value are placeholders; fill them in before running.
ds.push_to_hub("OpenAssistant/[name]", private=True, token="...")

# To verify the upload, reload the dataset from the Hub:
# ds = load_dataset("OpenAssistant/oasst1")
# print(ds.keys())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Copyright 2023 Open-Assistant Contributors. | |
# | |
# Licensed under the Apache License, Version 2.0 (the "License"); | |
# you may not use this file except in compliance with the License. | |
# You may obtain a copy of the License at | |
# | |
# http://www.apache.org/licenses/LICENSE-2.0 | |
# | |
# Unless required by applicable law or agreed to in writing, software | |
# distributed under the License is distributed on an "AS IS" BASIS, | |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
# See the License for the specific language governing permissions and | |
# limitations under the License. | |
"""OpenAssistant dataset loading script""" | |
import json | |
import os | |
import datasets | |
# BibTeX entry surfaced through `DatasetInfo.citation`.
_CITATION = """\
@InProceedings{huggingface:dataset,
title = {OpenAssistant Conversations - Democratizing Large Language Model Alignment},
author={OpenAssistant Authors},
year={2023}
}
"""

# Human-readable summary surfaced through `DatasetInfo.description`.
_DESCRIPTION = """\
In an effort to democratize research on large-scale alignment,
we release OpenAssistant Conversations, a human-generated, human-annotated
assistant-style conversation corpus consisting of over 100,000 messages distributed
across 10,940 conversations in 32 different languages, annotated with over 200,000
quality ratings. The corpus is a product of a worldwide crowd-sourcing effort
involving over 13,500 volunteers. To demonstrate the OpenAssistant Conversations
dataset’s effectiveness, we present the first fully open-source large-scale instruction-
tuned model.
"""

_HOMEPAGE = "https://open-assistant.io/"

_LICENSE = "Apache v2.0"

# Data files per builder configuration, keyed by configuration name and then
# by split key ("train" / "valid").
_URLS = {
    # "ready" currently points at local placeholder paths. The hosted export is
    # https://huggingface.co/datasets/OpenAssistant/oasst1/resolve/main/2023-04-12_oasst_ready.messages.jsonl.gz
    "ready": dict(
        train="path/DATE_oasst_ready_train.messages.jsonl",
        valid="path/DATE_oasst_ready_val.messages.jsonl",
    ),
    "all": dict(
        train="https://huggingface.co/datasets/OpenAssistant/oasst1/resolve/main/2023-04-12_oasst_all.messages.jsonl.gz",
    ),
    "prompts": dict(
        train="https://huggingface.co/datasets/OpenAssistant/oasst1/resolve/main/2023-04-12_oasst_prompts.messages.jsonl.gz",
    ),
    "spam": dict(
        train="https://huggingface.co/datasets/OpenAssistant/oasst1/resolve/main/2023-04-12_oasst_spam.messages.jsonl.gz",
    ),
}
class Oasst1(datasets.GeneratorBasedBuilder):
    """Dataset builder for OpenAssistant Conversations.

    One builder configuration exists per key of the module-level ``_URLS``
    mapping ("ready", "all", "prompts", "spam"). Each configuration reads
    JSON-lines message exports and yields one example per message.
    """

    VERSION = datasets.Version("1.0.0")

    # Selectable via `load_dataset(..., name=<config name>)`.
    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="ready",
            version=VERSION,
            description="Messages of message trees in ready_for_export state",
        ),
        datasets.BuilderConfig(
            name="all",
            version=VERSION,
            description="All messages including spam, deleted and synthetic",
        ),
        datasets.BuilderConfig(
            name="prompts",
            version=VERSION,
            description="Only prompts",
        ),
        datasets.BuilderConfig(
            name="spam",
            version=VERSION,
            description="Messages which were deleted or rejected during spam review",
        ),
    ]

    DEFAULT_CONFIG_NAME = "ready"

    # Fields that may be missing from an exported message; they are filled with
    # None so every example carries every declared feature.
    _OPTIONAL_FIELDS = ("parent_id", "rank", "model_name", "detoxify", "review_result")

    def _info(self):
        """Return the dataset metadata and feature schema.

        Raises:
            RuntimeError: if the selected configuration has no entry in ``_URLS``.
        """
        if self.config.name not in _URLS:
            raise RuntimeError("Unsupported configuration")

        features = datasets.Features(
            {
                "message_id": datasets.Value("string"),
                "parent_id": datasets.Value("string"),
                "user_id": datasets.Value("string"),
                "created_date": datasets.Value("string"),
                "text": datasets.Value("string"),
                "role": datasets.Value("string"),
                "lang": datasets.Value("string"),
                "review_count": datasets.Value("int32"),
                "review_result": datasets.Value("bool"),
                "deleted": datasets.Value("bool"),
                "rank": datasets.Value("int32"),
                "synthetic": datasets.Value("bool"),
                "model_name": datasets.Value("string"),
                "detoxify": {
                    "toxicity": datasets.Value("float64"),
                    "severe_toxicity": datasets.Value("float64"),
                    "obscene": datasets.Value("float64"),
                    "identity_attack": datasets.Value("float64"),
                    "insult": datasets.Value("float64"),
                    "threat": datasets.Value("float64"),
                    "sexual_explicit": datasets.Value("float64"),
                },
                "message_tree_id": datasets.Value("string"),
                "tree_state": datasets.Value("string"),
                "emojis": datasets.Sequence(
                    feature={"name": datasets.Value("string"), "count": datasets.Value("int32")}
                ),
                "labels": datasets.Sequence(
                    feature={
                        "name": datasets.Value("string"),
                        "value": datasets.Value("float64"),
                        "count": datasets.Value("int32"),
                    }
                ),
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download the files of the selected configuration and declare its splits.

        Args:
            dl_manager: download manager supplied by `datasets`; its
                ``download_and_extract`` is called with the configuration's
                ``_URLS`` entry and the result is indexed by the same keys.

        Returns:
            A list of ``datasets.SplitGenerator``. The train split is always
            present; the validation split is added only when the configuration
            defines a "valid" file.
        """
        print("_split_generators", self.config.name)
        urls = _URLS[self.config.name]
        data_dir = dl_manager.download_and_extract(urls)
        print("data_dir", data_dir)

        splits = [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filepath": data_dir["train"],
                    "split": "train",
                },
            ),
        ]
        # Only "ready" ships a validation file; indexing "valid" unconditionally
        # raised KeyError for the "all", "prompts" and "spam" configurations.
        if "valid" in urls:
            splits.append(
                datasets.SplitGenerator(
                    name=datasets.Split.VALIDATION,
                    # These kwargs will be passed to _generate_examples
                    gen_kwargs={
                        "filepath": data_dir["valid"],
                        "split": "dev",
                    },
                )
            )
        return splits

    @staticmethod
    def _normalize_message(msg):
        """Reshape one decoded message dict in place to match the feature schema.

        Drops "events", converts the "emojis" and "labels" mappings into lists
        of records (or None when missing or empty), and fills the remaining
        optional fields with None when absent. Returns the same dict.
        """
        msg.pop("events", None)

        emojis = msg.get("emojis")
        msg["emojis"] = (
            [{"name": k, "count": v} for k, v in emojis.items()] if emojis else None
        )

        labels = msg.get("labels")
        msg["labels"] = (
            [{"name": k, "value": v["value"], "count": v["count"]} for k, v in labels.items()]
            if labels
            else None
        )

        for optional_field in Oasst1._OPTIONAL_FIELDS:
            msg.setdefault(optional_field, None)
        return msg

    # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
    def _generate_examples(self, filepath, split):
        """Yield ``(message_id, message)`` pairs from a JSON-lines file.

        Args:
            filepath: path of the local JSON-lines file to read.
            split: split label passed through ``gen_kwargs``; only printed.
        """
        print("_generate_examples", filepath, split)
        with open(filepath, encoding="utf-8") as f:
            for line in f:
                # Tolerate blank lines (e.g. a trailing newline at end of file),
                # which json.loads would reject.
                if not line.strip():
                    continue
                msg = self._normalize_message(json.loads(line))
                yield msg["message_id"], msg
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment