Skip to content

Instantly share code, notes, and snippets.

@appleparan
Created March 4, 2024 15:27
Show Gist options
  • Save appleparan/5b70f9022ed247b03936e55e90d970ba to your computer and use it in GitHub Desktop.
Save appleparan/5b70f9022ed247b03936e55e90d970ba to your computer and use it in GitHub Desktop.
import pandas as pd
from datasets import load_dataset_builder, load_dataset, Dataset
def main(
    dataset_name1: str = 'kyujinpy/OpenOrca-KO',
    dataset_name2: str = 'Open-Orca/OpenOrca',
    split: str = 'train',
) -> None:
    """Join two Hub datasets on ``id`` and publish the paired rows.

    Rows whose ``id`` occurs in both datasets are merged into one table with
    the columns ``id``, ``system_prompt_ko``, ``question_ko``,
    ``response_ko``, ``system_prompt_en``, ``question_en`` and
    ``response_en``.

    Side effects:
        * prints the feature schema of both datasets, a preview of each
          renamed frame, the per-category row counts and the final
          dataset info;
        * writes the merged table to ``merged.jsonl`` in the current
          working directory;
        * pushes the merged dataset to ``appleparan/OpenOrca-Ko-En``.

    Args:
        dataset_name1: Hub name of the first dataset; its columns
            ``input`` / ``instruction`` / ``output`` become the ``*_ko``
            columns.
        dataset_name2: Hub name of the second dataset; its columns
            ``question`` / ``system_prompt`` / ``response`` become the
            ``*_en`` columns.
        split: Name of the split to read from both datasets; also passed
            to ``Dataset.from_pandas`` for the merged result.
    """
    # Show the declared schema of both datasets.
    builder1 = load_dataset_builder(dataset_name1)
    builder2 = load_dataset_builder(dataset_name2)
    print(f"Features of {dataset_name1}:")
    print(builder1.info.features)
    print(f"Features of {dataset_name2}:")
    print(builder2.info.features)

    # First dataset columns: id, input, instruction, output
    cols1_convert = {
        'id': 'id',
        'input': 'question_ko',
        'instruction': 'system_prompt_ko',
        'output': 'response_ko',
    }
    # Second dataset columns: id, system_prompt, question, response
    cols2_convert = {
        'id': 'id',
        'question': 'question_en',
        'system_prompt': 'system_prompt_en',
        'response': 'response_en',
    }

    # Load the requested split of each dataset.
    dataset1 = load_dataset(dataset_name1)[split]
    dataset2 = load_dataset(dataset_name2)[split]

    # Keep only the rows whose id is present in both datasets.
    common_ids = set(dataset1['id']) & set(dataset2['id'])
    common_rows1 = dataset1.filter(lambda row: row['id'] in common_ids)
    common_rows2 = dataset2.filter(lambda row: row['id'] in common_ids)

    # Use Dataset.to_pandas() rather than `.data.to_pandas()`: `.data` is the
    # backing Arrow table, which does not reflect the row selection made by
    # filter(), so going through it would convert every row of the dataset.
    df1 = common_rows1.to_pandas()
    df2 = common_rows2.to_pandas()

    # One row per id on each side, otherwise the merge multiplies duplicates.
    df1 = df1.drop_duplicates(subset=['id'])
    df2 = df2.drop_duplicates(subset=['id'])

    # Give both sides language-suffixed column names so they can coexist.
    df1 = df1.rename(columns=cols1_convert)
    print(df1.head(5))
    df2 = df2.rename(columns=cols2_convert)
    print(df2.head(5))

    # Inner join on id, then put the columns in their published order.
    merged_df = pd.merge(df1, df2, on='id')
    new_cols = [
        'id',
        'system_prompt_ko', 'question_ko', 'response_ko',
        'system_prompt_en', 'question_en', 'response_en',
    ]
    merged_df = merged_df[new_cols]

    # Save after reordering so the local file matches the pushed dataset.
    # force_ascii=False keeps the Korean text readable instead of \uXXXX
    # escapes; JSON parsers decode both forms to the same strings.
    merged_df.to_json(
        'merged.jsonl', orient='records', lines=True, force_ascii=False
    )

    # ids have 'category'.'id' format; count the rows in each category.
    categories = merged_df['id'].str.split('.', n=1).str[0]
    print(categories.value_counts())

    dataset = Dataset.from_pandas(merged_df, split=split)
    dataset.push_to_hub("appleparan/OpenOrca-Ko-En")

    # Print dataset info
    print(dataset.info)
# Script entry point: run the merge with the default dataset names and split.
if __name__ == "__main__":
    main()
datasets
pandas
huggingface_hub
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment