@andreaskoepf
Created November 6, 2023 08:47
OASST1 Huggingface compatible dataset generation scripts
from datasets import load_dataset

ds = load_dataset("/path/oasst1", name="ready")
train = ds["train"]
val = ds["validation"]
print(f"{len(train)=}")
print(f"{len(val)=}")

for i in range(5):
    print(train[i]["message_tree_id"])

ds.push_to_hub("OpenAssistant/[name]", private=True, token="...")

# Test loading from the Hub:
# ds = load_dataset("OpenAssistant/oasst1")
# print(ds.keys())
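Each row in these splits is one flat message record; full conversations can be rebuilt by following each message's parent_id back-pointer within its message_tree_id. A minimal sketch of that traversal against the train split loaded above (the field names come from the loading script below; the helper itself is illustrative and not part of the gist):

from collections import defaultdict

# Collect the messages of a single tree, grouped by parent_id
# (root prompts have parent_id == None).
tree_id = train[0]["message_tree_id"]
children = defaultdict(list)
for m in train:
    if m["message_tree_id"] == tree_id:
        children[m["parent_id"]].append(m)

def print_thread(parent_id=None, depth=0):
    # Depth-first walk over the reconstructed tree.
    for m in children[parent_id]:
        print("  " * depth + f"{m['role']}: {m['text'][:60]}")
        print_thread(m["message_id"], depth + 1)

print_thread()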
# Copyright 2023 Open-Assistant Contributors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""OpenAssistant dataset loading script"""
import json

import datasets
_CITATION = """\
@InProceedings{huggingface:dataset,
title = {OpenAssistant Conversations - Democratizing Large Language Model Alignment},
author={OpenAssistant Authors},
year={2023}
}
"""
_DESCRIPTION = """\
In an effort to democratize research on large-scale alignment,
we release OpenAssistant Conversations, a human-generated, human-annotated
assistant-style conversation corpus consisting of over 100,000 messages distributed
across 10,940 conversations in 32 different languages, annotated with over 200,000
quality ratings. The corpus is a product of a worldwide crowd-sourcing effort
involving over 13,500 volunteers. To demonstrate the OpenAssistant Conversations
dataset’s effectiveness, we present the first fully open-source large-scale instruction-
tuned model.
"""
_HOMEPAGE = "https://open-assistant.io/"
_LICENSE = "Apache v2.0"
_URLS = {
    # "ready": {"train": "https://huggingface.co/datasets/OpenAssistant/oasst1/resolve/main/2023-04-12_oasst_ready.messages.jsonl.gz"},
    "ready": {
        "train": "path/DATE_oasst_ready_train.messages.jsonl",
        "valid": "path/DATE_oasst_ready_val.messages.jsonl",
    },
    "all": {"train": "https://huggingface.co/datasets/OpenAssistant/oasst1/resolve/main/2023-04-12_oasst_all.messages.jsonl.gz"},
    "prompts": {"train": "https://huggingface.co/datasets/OpenAssistant/oasst1/resolve/main/2023-04-12_oasst_prompts.messages.jsonl.gz"},
    "spam": {"train": "https://huggingface.co/datasets/OpenAssistant/oasst1/resolve/main/2023-04-12_oasst_spam.messages.jsonl.gz"},
}


class Oasst1(datasets.GeneratorBasedBuilder):
    """OpenAssistant Conversations"""

    VERSION = datasets.Version("1.0.0")

    # If you need complex sub-parts with configurable options, you can create
    # your own builder configuration class inheriting from datasets.BuilderConfig:
    # BUILDER_CONFIG_CLASS = MyBuilderConfig
    # Each configuration in the list below can be selected by passing its name
    # to load_dataset().
    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="ready",
            version=VERSION,
            description="Messages of message trees in ready_for_export state",
        ),
        datasets.BuilderConfig(
            name="all",
            version=VERSION,
            description="All messages including spam, deleted and synthetic",
        ),
        datasets.BuilderConfig(
            name="prompts",
            version=VERSION,
            description="Only prompts",
        ),
        datasets.BuilderConfig(
            name="spam",
            version=VERSION,
            description="Messages which were deleted or rejected during spam review",
        ),
    ]

    DEFAULT_CONFIG_NAME = "ready"
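    # Usage sketch (assumption, not part of the original gist): the config name
    # selects which export is loaded, e.g.
    #   ds = load_dataset("path/to/this/script", name="spam")
    # Omitting `name` falls back to DEFAULT_CONFIG_NAME ("ready").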

    def _info(self):
        if self.config.name in _URLS:
            features = datasets.Features(
                {
                    "message_id": datasets.Value("string"),
                    "parent_id": datasets.Value("string"),
                    "user_id": datasets.Value("string"),
                    "created_date": datasets.Value("string"),
                    "text": datasets.Value("string"),
                    "role": datasets.Value("string"),
                    "lang": datasets.Value("string"),
                    "review_count": datasets.Value("int32"),
                    "review_result": datasets.Value("bool"),
                    "deleted": datasets.Value("bool"),
                    "rank": datasets.Value("int32"),
                    "synthetic": datasets.Value("bool"),
                    "model_name": datasets.Value("string"),
                    "detoxify": {
                        "toxicity": datasets.Value("float64"),
                        "severe_toxicity": datasets.Value("float64"),
                        "obscene": datasets.Value("float64"),
                        "identity_attack": datasets.Value("float64"),
                        "insult": datasets.Value("float64"),
                        "threat": datasets.Value("float64"),
                        "sexual_explicit": datasets.Value("float64"),
                    },
                    "message_tree_id": datasets.Value("string"),
                    "tree_state": datasets.Value("string"),
                    "emojis": datasets.Sequence(
                        feature={"name": datasets.Value("string"), "count": datasets.Value("int32")}
                    ),
                    "labels": datasets.Sequence(
                        feature={
                            "name": datasets.Value("string"),
                            "value": datasets.Value("float64"),
                            "count": datasets.Value("int32"),
                        }
                    ),
                }
            )
        else:
            raise RuntimeError("Unsupported configuration")

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        # The configuration selected by the user is available as self.config.name.
        # dl_manager downloads and extracts the URLs; it accepts nested lists/dicts
        # and returns the same structure with local file paths in place of the
        # URLs (archives are extracted to a cache folder by default).
        urls = _URLS[self.config.name]
        data_dir = dl_manager.download_and_extract(urls)
        splits = [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filepath": data_dir["train"],
                    "split": "train",
                },
            )
        ]
        # Only the "ready" config provides a validation file; unconditionally
        # indexing data_dir["valid"] would raise a KeyError for the other
        # configs, so add the validation split conditionally.
        if "valid" in data_dir:
            splits.append(
                datasets.SplitGenerator(
                    name=datasets.Split.VALIDATION,
                    # These kwargs will be passed to _generate_examples
                    gen_kwargs={
                        "filepath": data_dir["valid"],
                        "split": "dev",
                    },
                )
            )
        return splits

    # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
    def _generate_examples(self, filepath, split):
        with open(filepath, encoding="utf-8") as f:
            for line in f:
                msg = json.loads(line)
                # Drop the raw event log; it is not part of the declared features.
                if "events" in msg:
                    msg.pop("events")
                # Convert the emoji dict ({name: count}) into a list of records.
                if "emojis" in msg:
                    if msg["emojis"]:
                        msg["emojis"] = [{"name": k, "count": v} for k, v in msg["emojis"].items()]
                    else:
                        msg["emojis"] = None
                # Convert the label dict into {name, value, count} records.
                if "labels" in msg:
                    if msg["labels"]:
                        msg["labels"] = [
                            {"name": k, "value": v["value"], "count": v["count"]}
                            for k, v in msg["labels"].items()
                        ]
                    else:
                        msg["labels"] = None
                # Fields that are absent in the source jsonl are set to None.
                for optional_field in (
                    "parent_id", "rank", "model_name", "emojis", "labels", "detoxify", "review_result",
                ):
                    if optional_field not in msg:
                        msg[optional_field] = None
                yield msg["message_id"], msg
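
Once the exported .jsonl files are in place next to the script, the transformed records can be checked end to end. One detail worth knowing: datasets returns a Sequence defined over a dict of features as a dict of parallel lists rather than a list of dicts, so the {name, value, count} label records built in _generate_examples come back as three aligned lists. A short sketch (the local path is a placeholder):

from datasets import load_dataset

ds = load_dataset("/path/oasst1", name="ready")  # placeholder path to this script
msg = ds["train"][0]

# labels arrive as {"name": [...], "value": [...], "count": [...]} or None.
labels = msg["labels"]
if labels and labels.get("name"):
    for name, value, count in zip(labels["name"], labels["value"], labels["count"]):
        print(f"{name}: {value:.2f} (n={count})")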