-
-
Save dartpain/8f5ac117d189a631d3b1599d95710ef6 to your computer and use it in GitHub Desktop.
Preparing data for embedding-model fine-tuning: split a CSV of question/context pairs into train and validation JSON datasets.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### Prep data
# Build train/validation query-corpus datasets (for embedding fine-tuning)
# from input.csv and write them to train_data.json / val_data.json.
#
# BUG FIX: the original script made a *second* full pass over input.csv and
# appended every row to the validation dicts — including rows already assigned
# to the training split — which duplicated data and leaked training examples
# into validation. That pass also skipped the question-length filter. The
# redundant pass has been removed; each row is now assigned to exactly one
# split.
import json
import csv
import uuid
import random


def build_datasets(rows, train_ratio=0.8, rng=random):
    """Split rows into train/validation dataset dicts.

    Each row is expected to be a mapping with 'Question', 'Context' and 'C2'
    keys (as produced by csv.DictReader on input.csv). Rows whose question is
    2 characters or shorter are skipped as empty/placeholder entries.

    Returns a (train_dataset, val_dataset) pair, each of the form
    {'queries': {id: question}, 'corpus': {id: context},
     'relevant_docs': {query_id: [corpus_id, ...]}}.

    rng must provide a .random() method; injectable for reproducible tests.
    """
    train = {'queries': {}, 'corpus': {}, 'relevant_docs': {}}
    val = {'queries': {}, 'corpus': {}, 'relevant_docs': {}}
    for row in rows:
        # Skip rows with empty/placeholder questions.
        if len(row['Question']) <= 2:
            continue
        question_uuid = str(uuid.uuid4())
        context_uuid = str(uuid.uuid4())
        context2_uuid = str(uuid.uuid4())
        # Randomly assign to train (train_ratio) or validation (rest).
        target = train if rng.random() < train_ratio else val
        target['queries'][question_uuid] = row['Question']
        target['corpus'][context_uuid] = row['Context']
        target['corpus'][context2_uuid] = row['C2']
        # Both contexts are relevant documents for this question.
        target['relevant_docs'].setdefault(question_uuid, []).extend(
            [context_uuid, context2_uuid]
        )
    return train, val


def main():
    """Read input.csv and write the two dataset JSON files."""
    # newline='' is the documented way to open files for the csv module.
    with open('input.csv', 'r', encoding='utf-8', errors='replace',
              newline='') as csvfile:
        rows = list(csv.DictReader(csvfile))

    train_dataset, val_dataset = build_datasets(rows)

    with open("train_data.json", "w") as outfile:
        json.dump(train_dataset, outfile, indent=4)
    with open("val_data.json", "w") as outfile:
        json.dump(val_dataset, outfile, indent=4)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment