# TheodoreGalanos/domain_knowledge_generation_gpt4.py

## domain_knowledge_generation_gpt4.py
import ast
import json
import os
import re
import time
from string import ascii_uppercase

import openai
from dotenv import load_dotenv
from jinja2 import Template, meta, Environment
# Load the local .env file first so the os.getenv() calls below can see its
# values. The .env file should define OPENAI_API_KEY, OPENAI_API_TYPE,
# OPENAI_API_VERSION and OPENAI_API_BASE; OPENAI_DEPLOYMENT_NAME is read
# later, inside chat_completion().
load_dotenv()
# setup is for azure, change accordingly for normal openai
# NOTE: os.getenv() returns None for a variable that is not set, so a missing
# entry is not reported here.
openai.api_key = os.getenv("OPENAI_API_KEY")
openai.api_type = os.getenv("OPENAI_API_TYPE")
openai.api_version = os.getenv("OPENAI_API_VERSION")
openai.api_base = os.getenv("OPENAI_API_BASE")

def get_ninja_vars(jinja_template):
    """Return the undeclared variable names found in a Jinja template source.

    Args:
        jinja_template: Template source text to inspect.

    Returns:
        A list built from the set reported by
        ``jinja2.meta.find_undeclared_variables`` for the parsed template.
    """
    parsed_template = Environment().parse(jinja_template)
    undeclared_names = meta.find_undeclared_variables(parsed_template)
    return [*undeclared_names]

def chat_completion(system, user, assistant, temperature: float, max_tokens: int):
    """Send one system/user/assistant exchange to the chat API and return the reply text.

    The deployment name is read from the ``OPENAI_DEPLOYMENT_NAME``
    environment variable on every call.

    Args:
        system: Content of the ``system`` message.
        user: Content of the ``user`` message.
        assistant: Content of the trailing ``assistant`` message, sent after
            the user message (presumably a lead-in for the reply; confirm
            against the prompts that use it).
        temperature: Sampling temperature forwarded to the API.
        max_tokens: Completion token limit forwarded to the API.

    Returns:
        The ``content`` of the message in the first choice of the response.
    """
    response = openai.ChatCompletion.create(
        deployment_id=os.getenv("OPENAI_DEPLOYMENT_NAME"),
        messages=[
            {"role": "system", "content": system},
            {"role": "user", "content": user},
            {"role": "assistant", "content": assistant},
        ],
        temperature=temperature,
        max_tokens=max_tokens,
    )
    return response.choices[0].message.content

def create_md(result_json):
    """Render the collected challenge results as one Markdown document.

    Args:
        result_json: Mapping with a ``challenge_description`` entry and an
            ``areas`` list. Every area is a mapping holding ``challenge``,
            ``challenge_desc``, ``cf_summary`` and ``answer`` plus
            consecutive ``probing_<i>`` / ``probing_<i>_answer`` pairs
            numbered from 0.

    Returns:
        The Markdown text: the challenge description followed by one section
        per area (description, lettered counterfactual probes, summary and
        answer).

    Raises:
        KeyError: If a required entry is missing, including the
            ``probing_<i>_answer`` partner of an existing ``probing_<i>``.
    """
    parts = [
        f"# Challenge Description\n{result_json['challenge_description']}\n\n# Selected Challenge Areas\n"
    ]
    for area in result_json['areas']:
        parts.append(
            f"## {area['challenge']}\n\n### Description\n\n{area['challenge_desc']}.\n\n### Counterfactual Probing\n\n"
        )
        # Walk the probes by key name rather than by key position, so extra
        # entries in the area mapping cannot shift or miscount them.
        index = 0
        while f"probing_{index}" in area:
            question = area[f"probing_{index}"]
            answer = area[f"probing_{index}_answer"]
            # Letters cover the first 26 probes; later ones fall back to a
            # 1-based number instead of failing.
            if index < len(ascii_uppercase):
                label = ascii_uppercase[index]
            else:
                label = str(index + 1)
            parts.append(f"#### {label}. {question}\n\n{answer}\n\n")
            index += 1
        parts.append(
            f"#### Summary\n\n{area['cf_summary']}\n\n### Answer\n{area['answer']}\n\n"
        )
    return "".join(parts)

### Templates for interactions ###

## Step 1: Create a knowledge challenge ##
# System prompt. Jinja variables: capability, discipline.
# It asks for a JSON reply with a "challenge_description" string and an
# "areas" list of {"name", "description"} objects; the generation script
# further down reads exactly those keys from the parsed reply.
knowledge_challenge_system = """You are a world renown expert in the field of {{ capability }}. You are a writer of tests, challenges, and deep knowledge on {{ capability }} for practitioners in the field to gain deep insights and critical understanding from. The focus of your challenges is on {{ discipline }}. The challenges you write should incite deep contemplation and rely on, and relate to, multiple branching aspects and contexts of the topic chosen into the knowledge challenge. You always write the output of your response in json, in the following way:
{
    "challenge_description": "detailed description of the challenge",
    "areas": [
        {"name": "area name", "description": "description of the challenge"},
        {"name": "area name", "description": "description of the challenge"},
        ...
    ]
}"""
knowledge_challenge_system_template = Template(knowledge_challenge_system)

# User prompt. Jinja variables: capability, discipline.
knowledge_challenge_user = """Create a knowledge challenge at the intersection of {{ capability }} and {{ discipline }}. You should write one challenge for at least five (5) important areas of the field.""" # can change the number of areas here, gpt4 can easily do 10.
knowledge_challenge_user_template = Template(knowledge_challenge_user)

# Plain string (no template variables) passed as the trailing assistant message.
knowledge_challenge_assistant = "JSON file containing the challenge details:"

## Step 2: Counterfactual Questions across each area ##
# System prompt. Jinja variable: capability.
# It asks for a JSON object of "question_<k>" entries; the generation script
# further down iterates over the values of the parsed object.
cf_questions_system = """You are a world renown expert in the field of {{ capability }}. You are a writer of tests, challenges, questions, and deep knowledge on {{ capability }} for practitioners in the field to gain deep insights and understanding from. The counterfactual questions you write incite deep contemplation and critical probing of complex subjects, and allow users to identify and relate to multiple branching aspects and contexts of the subject.
You always write the output of your response in json, in the following way:
{
    "question_1": "the first counterfactual question",
    "question_2": "the second counterfactual question",
    "question_3": "the third counterfactual questions",
    ...
}"""
cf_questions_system_template = Template(cf_questions_system)

# User prompt. Jinja variables: capability, n, challenge_description, areas,
# selected_area.
cf_questions_user = """You are given a challenge designed to elicit complex reasoning and evaluate knowledge and expertise in the domain of {{ capability }}.

Each challenge is split into {{ n }} parts, where each part is trying to evaluate an important aspect or area of the domain.

Your task is to pose five (5) detailed counterfactual questions for the selected area of the challenge.

Challenge description: {{ challenge_description }}

Important areas:
{{ areas }}

Selected area: {{ selected_area }}"""
cf_questions_user_template = Template(cf_questions_user)

# Plain string (no template variables) passed as the trailing assistant message.
cf_questions_assistant = "Counterfactual Questions JSON:"

## Step 3: Counterfactual Answers ##
# System prompt. Jinja variables: capability, discipline.
# It asks for a JSON object with a single "answer" entry, which is the key the
# generation script further down reads from the parsed reply.
cf_answers_system = """You are a world renown expert in the field of {{ capability }}. You are given a complex challenge at the intersection of {{ capability }} and {{ discipline }}, designed to elicit complex reasoning and evaluate knowledge and expertise in {{ capability }}. The challenge is broken down to a number of important areas that aim to probe crucial aspects of the challenge, along with a range of varied professional experiences and expertise.

Your are also given a counterfactual question about a selected area. Your task is to write a long, explanatory answer to the question, always explaining the full context of your answer (i.e. related concepts, practical implications, external references), as well as the step-by-step thinking process you take to answer the question. Your answers should be written in a factual and dense style, thorough, but explained in a way someone new to the domain would understand. You should also discuss why other competing views may have merit.
Respond always in a well-formatted JSON, as in the example below:
{
    "answer": "the answer to the counterfactual question"
}"""
cf_answers_system_template = Template(cf_answers_system)

# User prompt. Jinja variables: challenge_description, areas, selected_area,
# question.
cf_answers_user = """A counterfactual question, for a specific area of a knowledge challenge, is given below along with challenge details.

Challenge description: {{ challenge_description }}

Important areas:
{{ areas }}

Selected area: {{ selected_area }}

Counterfactual question:
{{ question }}
"""
cf_answers_user_template = Template(cf_answers_user)

# Plain string (no template variables) passed as the trailing assistant message.
cf_answers_assistant = "Counterfactual Answer JSON:"

## Step 4: Counterfactual Summary ##
# System prompt. Jinja variables: n, capability.
# The reply is requested as free text; the generation script stores it as-is.
cf_summary_system = """You are given a set of counterfactuals (pair of questions and answers), for one of the {{ n }} areas of a knowledge challenge in the domain of {{ capability }}.

Your task is to consider all these counterfactuals and create a summary that contains analysis of the following aspects:
1. The most important concerns in this area.
2. What could practically go wrong and why.
3. Main considerations for interventions.

Make the summary a standalone, free flowing text that merges the above aspects naturally and doesn't require context to be understood, without referring to the counterfactuals at the start."""
cf_summary_system_template = Template(cf_summary_system)

# User prompt. Jinja variables: challenge_description, selected_area,
# counterfactuals. Placeholders must use Jinja's double-brace syntax:
# single-brace {name} is plain text to Template.render() and would reach the
# model unsubstituted.
cf_summary_user = """Challenge: {{ challenge_description }}

Selected area: {{ selected_area }}

Counterfactuals:
{{ counterfactuals }}"""
cf_summary_user_template = Template(cf_summary_user)

# Plain string (no template variables) passed as the trailing assistant message.
cf_summary_assistant = "Summary:"

## Step 5: Answer Challenge ##
# System prompt. Jinja variables: capability, discipline.
# The reply is requested as free text; the generation script stores it as-is.
answer_challenge_system = """You are a world renown expert in the field of {{ capability }}. You are given a complex challenge at the intersection of {{ capability }} and {{ discipline }}. The challenge can be broken down to different parts that need to be answered.

Your task is to provide a long, comprehensive answer to a given part of the challenge. You will need to think through the implications the current area has on the challenge as a whole, and highlight its connection to the other areas in the challenge. To help you craft the answer, a summary of counterfactual arguments is provided. Using all this information, create a rationale as to why this area of the challenge is important, and then formulate your answer.

While performing the task you should think step-by-step and justify your steps. Provide a detailed answer so that someone who reads the answer doesn't need to search outside to understand it."""
answer_challenge_system_template = Template(answer_challenge_system)

# User prompt. Jinja variables: challenge_description, areas, selected_area,
# counterfactual_summary. Placeholders must use Jinja's double-brace syntax:
# single-brace {name} is plain text to Template.render() and would reach the
# model unsubstituted.
answer_challenge_user = """Challenge:
{{ challenge_description }}
Areas:
{{ areas }}
Current area:
{{ selected_area }}
Counterfactual arguments:
{{ counterfactual_summary }}
"""
answer_challenge_user_template = Template(answer_challenge_user)

# Plain string (no template variables) passed as the trailing assistant message.
answer_challenge_assistant = "Answer:"


### Generate Challenge ###
# Pipeline: (1) generate a challenge split into areas, then for every area
# (2) generate counterfactual questions, (3) answer each question,
# (4) summarise the question/answer pairs and (5) answer the area itself.
# The collected results are written to a JSON file and a Markdown file.
capability = "Asset Management and Performance"
discipline = "Asset Reliability"


def _parse_model_json(raw_text):
    """Parse a model reply that is expected to contain one JSON object.

    Args:
        raw_text: Reply text returned by ``chat_completion``.

    Returns:
        The parsed object.

    Raises:
        ValueError, SyntaxError: If the text is neither valid JSON nor a
            valid Python literal.
    """
    # Collapse every whitespace run to one space: raw line breaks inside a
    # string value are rejected by both parsers used below.
    flattened = re.sub(r'\s+', ' ', raw_text)
    try:
        return json.loads(flattened)
    except json.JSONDecodeError:
        # Python-literal parsing accepts near-JSON that json.loads rejects
        # (single-quoted strings, trailing commas).
        return ast.literal_eval(flattened)


## Step 1: Challenge Generation ##
system = knowledge_challenge_system_template.render(
    {
        "capability": capability,
        "discipline": discipline
    }
)

# The user turn is rendered from the user template (the request itself), not
# from the system template.
user = knowledge_challenge_user_template.render(
    {
        "capability": capability,
        "discipline": discipline
    }
)
results_json = dict()
response = chat_completion(system, user, knowledge_challenge_assistant, temperature=0.7, max_tokens=1000)
# process response
challenge = _parse_model_json(response)
subtasks = [item['name'] for item in challenge['areas']]
subtasks_desc = [item['description'] for item in challenge['areas']]
challenge_description = challenge['challenge_description']
print('\n--- Challenge Description finished! ---\n')
print(challenge)

results_json['challenge_description'] = challenge_description
results_json['areas'] = []
for i, (task, desc) in enumerate(zip(subtasks, subtasks_desc)):
    # Key insertion order matters for readability of the JSON output:
    # challenge, challenge_desc, probing pairs, cf_summary, answer.
    area_dict = dict()
    area_dict['challenge'] = task
    area_dict['challenge_desc'] = desc

    ## Step 2: Counterfactual Questions Generation ##
    system = cf_questions_system_template.render(
        {
            "capability": capability
        }
    )
    user = cf_questions_user_template.render(
        {
            "capability": capability,
            "n": str(len(subtasks_desc)),
            "challenge_description": challenge_description,
            "areas": '\n'.join(f'Area: {a}\nTask: {q}\n' for a,q in zip(subtasks, subtasks_desc)),
            "selected_area" : f'Area: {subtasks[i]}\n{subtasks_desc[i]}'
        }
    )
    cf_questions = chat_completion(system, user, cf_questions_assistant, temperature=0.7, max_tokens=3000)
    cf_questions = _parse_model_json(cf_questions)
    print(cf_questions)
    # Pause between requests (presumably to stay under the API rate limit).
    time.sleep(10)

    ## Step 3: Counterfactual Answers Generation ##
    # Collected across ALL questions of this area so that the summary step
    # below receives every question/answer pair.
    questions, answers = [], []
    for j, value in enumerate(cf_questions.values()):
        system = cf_answers_system_template.render(
            {
                "capability": capability,
                "discipline": discipline,
            }
        )
        user = cf_answers_user_template.render(
            {
                "challenge_description": challenge_description,
                "areas": '\n'.join(f'Area: {a}\nTask Description: {q}' for a,q in zip(subtasks, subtasks_desc)),
                "selected_area" : f'Area: {subtasks[i]}\n{subtasks_desc[i]}',
                "question": value
            }
        )
        cf_answer = chat_completion(system, user, cf_answers_assistant, temperature=0.7, max_tokens=4000)
        cf_answer = _parse_model_json(cf_answer)
        area_dict[f'probing_{j}'] = value
        area_dict[f'probing_{j}_answer'] = cf_answer['answer']
        questions.append(value)
        answers.append(cf_answer['answer'])
        time.sleep(10)
    print(area_dict)

    print('\n--- Counterfactual Answers finished! ---\n')

    ## Step 4: Counterfactual Summary Generation ##
    system = cf_summary_system_template.render(
        {
            "capability": capability,
            "n": str(len(subtasks_desc)),
        }
    )
    user = cf_summary_user_template.render(
        {
            "challenge_description": challenge_description,
            "selected_area" : f'Area: {subtasks[i]}\n{subtasks_desc[i]}',
            "counterfactuals": ''.join([f"Counterfactual question: {q}\nAnswer: {a}\n\n" for q,a in zip(questions, answers)])
        }
    )
    cf_summary = chat_completion(system, user, cf_summary_assistant, temperature=0.7, max_tokens=800)
    area_dict['cf_summary'] = cf_summary
    print(cf_summary)
    print('\n--- Counterfactual Summary finished! ---\n')
    time.sleep(10)

    ## Step 5: Answer Challenge Generation ##
    system = answer_challenge_system_template.render(
        {
            "capability": capability,
            "discipline": discipline,
        }
    )
    user = answer_challenge_user_template.render(
        {
            "challenge_description": challenge_description,
            "areas": '\n'.join(f'Area: {a}\nQuestion: {q}' for a,q in zip(subtasks, subtasks_desc)),
            "selected_area" : f'Area: {subtasks[i]}\n{subtasks_desc[i]}',
            "counterfactual_summary": cf_summary
        }
    )
    challenge_answer = chat_completion(system, user, answer_challenge_assistant, temperature=0.7, max_tokens=3000)
    print(challenge_answer)
    area_dict['answer'] = challenge_answer
    results_json['areas'].append(area_dict)
    time.sleep(20)

# Persist the raw results, then the Markdown rendering of the same data.
with open(f'{discipline}_{capability}_challenge_gpt4.json', 'w', encoding='utf8') as out_f:
    json.dump(results_json , out_f)
md_string = create_md(results_json)
with open(f'{discipline}_{capability}_gpt4.md', 'w', encoding='utf8') as out_f:
    out_f.write(md_string)
	# NOTE: a second, indentation-stripped copy of this entire script used to
	# follow here. It could not be parsed and only repeated the imports,
	# functions, templates and pipeline defined above, so it was removed.