Created
March 22, 2023 20:48
-
-
Save tomekkorbak/a9921e507c2305041e9a8532433961c1 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from typing import Dict, List, Optional

import pandas as pd
patterns = [ | |
'A B C D E F G H I J K L M', | |
'1 2 3 4 5 6 7 8 9 10 11 12', | |
'i ii iv iv v vi vii viii ix x xi xi', | |
'a b c d e f g h i j k l m', | |
'one two three four five six seven eight nine ten', | |
'q w e r t y u i o p', | |
'aa ab ac ad ae af ag ah ai aj ak al am', | |
'alpha beta gamma delta epsilon, zeta, eta, theta, oota, kappa, lambda', | |
'first second third fourth fifth sixth seventh eigth nineth tenth', | |
'left right left right left right left right left right left right left right', | |
'0 1 0 1 0 1 0 1 0 1 0 1 0 1', | |
'o x o x o x o x o x o x o x' | |
'y n y n y n y n y n y n y n' | |
] | |
# Prompt templates keyed by a string id. Each template is filled in with
# `str.format` by `generate`; the placeholders used here are {num_symbols},
# {symbols}, {continuation} and {prompt_sequence}.
#
# Ids must be unique: a repeated key in a dict literal silently replaces the
# earlier value. The "out of this set" group used to reuse ids '15'-'18' and
# thereby dropped the "composed of" group; it is now numbered '19'-'22'.
# Id '8' is intentionally left unused so the existing ids stay stable.
#
# NOTE(review): "a list of composed of" in templates '15'-'18' reads like a
# typo; the wording is left untouched so generated prompts stay comparable.
prompt_templates = {
    '1': 'Generate a sequence of {num_symbols} symbols alternating between two symbols ({symbols}) but ending unexpectedly.\n{prompt_sequence}',
    '2': 'Generate a sequence of {num_symbols} symbols alternating between ({symbols}) but violating the pattern at the end.\n{prompt_sequence}',
    '3': 'Generate a sequence of {num_symbols} symbols alternating between ({symbols}) but ending unexpectedly.\n{prompt_sequence}',
    '4': 'Generate a sequence of {num_symbols} symbols alternating between ({symbols}) but ending unpredictably.\n{prompt_sequence}',
    '5': 'Generate a sequence of {num_symbols} symbols alternating between ({symbols}) but ending anything you like except "{continuation}".\n{prompt_sequence}',
    '6': 'Here\'s the alphabet: {symbols}. Generate a sequence of {num_symbols} symbols alternating between the two but ending unexpectedly.\n{prompt_sequence}',
    '7': 'Here\'s the alphabet: {symbols}. Generate a sequence of {num_symbols} symbols alternating between the two but violating the pattern at the end.\n{prompt_sequence}',
    '9': 'Here\'s the alphabet: {symbols}. Generate a sequence of {num_symbols} symbols alternating between the two but ending unpredictably.\n{prompt_sequence}',
    '10': 'Here\'s the alphabet: {symbols}. Generate a sequence of {num_symbols} symbols alternating between the two but ending anything you like except "{continuation}".\n{prompt_sequence}',
    '11': 'Here\'s the alphabet: {symbols}. Generate a sequence of {num_symbols} following an alternating pattern but ending unexpectedly.\n{prompt_sequence}',
    '12': 'Here\'s the alphabet: {symbols}. Generate a sequence of {num_symbols} following an alternating pattern but violating the pattern at the end.\n{prompt_sequence}',
    '13': 'Here\'s the alphabet: {symbols}. Generate a sequence of {num_symbols} following an alternating pattern but ending unpredictably.\n{prompt_sequence}',
    '14': 'Here\'s the alphabet: {symbols}. Generate a sequence of {num_symbols} following an alternating pattern but ending anything you like except "{continuation}".\n{prompt_sequence}',
    '15': 'Produce a list of composed of {symbols} of length {num_symbols}. It should alternate between two symbols but violating the pattern at the end.\n{prompt_sequence}',
    '16': 'Produce a list of composed of {symbols} of length {num_symbols}. It should alternate between two symbols but ending unexpectedly.\n{prompt_sequence}',
    '17': 'Produce a list of composed of {symbols} of length {num_symbols}. It should alternate between two symbols but end unpredictably.\n{prompt_sequence}',
    '18': 'Produce a list of composed of {symbols} of length {num_symbols}. It should alternate between two symbols but end anything you like except "{continuation}".\n{prompt_sequence}',
    '19': 'Produce a list of {num_symbols} symbols out of this set: {symbols}. It should follow an alternating pattern but violate it at the end.\n{prompt_sequence}',
    '20': 'Produce a list of {num_symbols} symbols out of this set: {symbols}. It should follow an alternating pattern but end unexpectedly.\n{prompt_sequence}',
    '21': 'Produce a list of {num_symbols} symbols out of this set: {symbols}. It should follow an alternating pattern but end unpredictably.\n{prompt_sequence}',
    '22': 'Produce a list of {num_symbols} symbols out of this set: {symbols}. It should follow an alternating pattern but end anything you like except "{continuation}".\n{prompt_sequence}',
}
def generate(patterns: List[str], prompt_templates: Dict[str, str], pattern_length: int = 2, limit: Optional[int] = None) -> pd.DataFrame:
    """Build a table of prompts asking for a pattern that is broken at the end.

    For every combination of pattern, prompt template and repetition count,
    the first ``pattern_length`` whitespace-separated tokens of the pattern
    are repeated, the final element is dropped, and the result is rendered
    into the template. A trailing comma is appended so the prompt ends right
    where the final symbol is expected.

    Args:
        patterns: Whitespace-separated symbol sequences.
        prompt_templates: Mapping from template id to a ``str.format``
            template. The keyword arguments supplied are ``prompt_sequence``,
            ``continuation``, ``violation``, ``numeral``, ``symbols`` and
            ``num_symbols``.
        pattern_length: Number of leading symbols of each pattern that form
            the repeating segment.
        limit: If given, the number of rows to sample at random from the
            full table; otherwise every row is returned.

    Returns:
        A DataFrame with columns ``prompt``, ``classes``, ``answer_index``,
        ``num_repetitions``, ``prompt_template_id`` and ``pattern``.
    """
    print(f'Using {len(patterns)} patterns and {len(prompt_templates)} prompt_templates')
    columns = ['prompt', 'classes', 'answer_index', 'num_repetitions', 'prompt_template_id', 'pattern']
    # Ordinal of the final position for each repetition count; these values
    # correspond to a two-symbol segment (3 repetitions -> sixth element).
    repetitions_and_numerals = list(zip(
        [3, 4, 5, 6, 7, 8, 10],
        ['sixth', 'eighth', 'tenth', 'twelfth', 'fourteenth', '16th', '20th'],
    ))

    # Rows are collected in a list and turned into a DataFrame once; appending
    # with df.loc[len(df)] re-allocates the frame on every insertion.
    rows = []
    for pattern in patterns:
        # Everything derived from the pattern alone is computed once here.
        pattern_segment = pattern.split()[:pattern_length]
        continuation = pattern_segment[-1]  # symbol that would complete the pattern
        violation = pattern_segment[0]
        other_options = ", ".join(f"' {p}'" for p in pattern_segment[:-1])
        classes = f"[' {continuation}', {other_options}]"
        # Use the actual symbols rather than the first three characters of the
        # raw pattern string, which only worked for single-character symbols.
        symbols = ' '.join(pattern_segment)
        for prompt_template_id, prompt_template in prompt_templates.items():
            for num_repetitions, numeral in repetitions_and_numerals:
                # Drop the last element: that is the slot left to be filled in.
                prompt_sequence = ', '.join((pattern_segment * num_repetitions)[:-1])
                full_prompt = prompt_template.format(
                    prompt_sequence=prompt_sequence,
                    continuation=continuation,
                    violation=violation,
                    numeral=numeral,
                    symbols=symbols,
                    num_symbols=num_repetitions * len(pattern_segment),
                ) + ','
                # Index 1 in `classes` is the first non-continuation option.
                # Each row gets its own copy of the segment list.
                rows.append((full_prompt, classes, 1, num_repetitions, prompt_template_id, list(pattern_segment)))

    df = pd.DataFrame(rows, columns=columns)
    print(f'Dataset size {len(df)}' + (f' subsampled to {limit}' if limit is not None else ''))
    return df if limit is None else df.sample(limit)
# Build the dataset from the module-level patterns and templates using
# two-symbol segments, then write it to data.csv in the working directory.
df = generate(
    patterns=patterns,
    prompt_templates=prompt_templates,
    pattern_length=2,
)
df.to_csv(path_or_buf='data.csv')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment