@dartpain
Created March 8, 2024 16:04
Preparing data for embedding model fine-tuning
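The script below expects a file named input.csv with at least the columns Question, Context, and C2 (a second supporting context per question). Those column names come straight from the code; the quick header check here is only an optional sketch for catching a malformed CSV before running the full prep:

import csv

with open('input.csv', 'r', encoding='utf-8', errors='replace') as csvfile:
    reader = csv.DictReader(csvfile)
    # DictReader exposes the header row via .fieldnames
    missing = {'Question', 'Context', 'C2'} - set(reader.fieldnames or [])
    if missing:
        raise ValueError(f"input.csv is missing expected columns: {missing}")
    print(f"Header looks good: {reader.fieldnames}")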
### Prep data
import csv
import json
import random
import uuid

# Initialize the train and validation dictionaries
train_queries = {}
train_corpus = {}
train_relevant_docs = {}
val_queries = {}  # validation datasets
val_corpus = {}
val_relevant_docs = {}

# First pass: read the CSV and randomly split rows into train (80%) and validation (20%)
with open('input.csv', 'r', encoding='utf-8', errors='replace') as csvfile:
    csv_reader = csv.DictReader(csvfile)
    for row in csv_reader:
        question_uuid = str(uuid.uuid4())
        context_uuid = str(uuid.uuid4())
        context2_uuid = str(uuid.uuid4())
        # Skip rows with an empty or trivially short question
        if len(row['Question']) > 2:
            if random.random() < 0.8:
                # Add to the train datasets
                train_queries[question_uuid] = row['Question']
                train_corpus[context_uuid] = row['Context']
                train_corpus[context2_uuid] = row['C2']
                train_relevant_docs.setdefault(question_uuid, []).extend([context_uuid, context2_uuid])
            else:
                # Add to the validation datasets
                val_queries[question_uuid] = row['Question']
                val_corpus[context_uuid] = row['Context']
                val_corpus[context2_uuid] = row['C2']
                val_relevant_docs.setdefault(question_uuid, []).extend([context_uuid, context2_uuid])

# Second pass: add every row (with fresh IDs) to the validation queries and corpus,
# so the validation set covers the full data
with open('input.csv', 'r', encoding='utf-8', errors='replace') as csvfile:
    csv_reader = csv.DictReader(csvfile)
    for row in csv_reader:
        question_uuid = str(uuid.uuid4())
        context_uuid = str(uuid.uuid4())
        context2_uuid = str(uuid.uuid4())
        val_queries[question_uuid] = row['Question']
        val_corpus[context_uuid] = row['Context']
        val_corpus[context2_uuid] = row['C2']
        val_relevant_docs.setdefault(question_uuid, []).extend([context_uuid, context2_uuid])

# Assemble the final dictionaries
train_dataset = {
    'queries': train_queries,
    'corpus': train_corpus,
    'relevant_docs': train_relevant_docs,
}

val_dataset = {
    'queries': val_queries,
    'corpus': val_corpus,
    'relevant_docs': val_relevant_docs,
}

# Write the datasets to JSON files
with open("train_data.json", "w") as outfile:
    json.dump(train_dataset, outfile, indent=4)

with open("val_data.json", "w") as outfile:
    json.dump(val_dataset, outfile, indent=4)
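The queries / corpus / relevant_docs layout written here matches the JSON format used by LlamaIndex's EmbeddingQAFinetuneDataset, which is a common consumer for files like these. If that is the intended target, loading and fine-tuning looks roughly like the sketch below; the model_id and output path are placeholders, and the exact import path can vary between llama-index versions, so treat this as an assumption rather than part of the original gist:

from llama_index.finetuning import (
    EmbeddingQAFinetuneDataset,
    SentenceTransformersFinetuneEngine,
)

# Load the files produced above (assumes they follow the EmbeddingQAFinetuneDataset JSON format)
train_dataset = EmbeddingQAFinetuneDataset.from_json("train_data.json")
val_dataset = EmbeddingQAFinetuneDataset.from_json("val_data.json")

# Placeholder base model and output directory -- adjust to your setup
finetune_engine = SentenceTransformersFinetuneEngine(
    train_dataset,
    model_id="BAAI/bge-small-en-v1.5",
    model_output_path="finetuned_embedding_model",
    val_dataset=val_dataset,
)
finetune_engine.finetune()
embed_model = finetune_engine.get_finetuned_model()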