-
-
Save jkpe/534fef4b31a50b3d295fcbc7fe06197a to your computer and use it in GitHub Desktop.
Keypoints.py - Summarize a long text document to bullet points
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import openai

# Instruction prepended to every chunk sent to the model.
prompt = "I'm a busy CEO, please summarize the following in 10-20 bullet points. Include only the most important, useful or interesting information.\n\n"
# model = 'gpt-3.5-turbo' # 'gpt-4' # moved to gpt_chat_completion since we use both
CONTEXT_SIZE = 4096  # total context window assumed for gpt-3.5-turbo (2048 for GPT-3)
# CONTEXT_SIZE = 1024 # tiny context for testing
MAX_TOKENS_IN = int(CONTEXT_SIZE*2/3)  # two thirds of the window for the input chunk
MAX_TOKENS_OUT = CONTEXT_SIZE - MAX_TOKENS_IN  # remainder reserved for the model's reply
def get_file_size(file_path):
    """Return the size of *file_path* in bytes, or None if it cannot be stat'ed."""
    try:
        return os.path.getsize(file_path)
    except OSError as e:
        print(f"Error: {e}")
        return None
def counttokens(text):
    """Rough token-count estimate: about 3 characters per token.  # todo tiktoken"""
    return len(text) // 3
def gpt(text, gpt4=False):
    """Summarize *text* via chat completion; gpt4=True selects the larger model.

    Thin convenience wrapper: both branches of the original called
    gpt_chat_completion with only the flag differing, so the flag is
    now forwarded directly.
    """
    return gpt_chat_completion(text, gpt4=gpt4)
def gpt_chat_completion(text, gpt4=False):
    """Send *text* as a single user message and return the assistant's reply.

    The summarization prompt is already included in *text* by the caller.
    gpt4=True selects the 'gpt-4' model and doubles the output-token budget
    (its context window is assumed to be twice CONTEXT_SIZE); otherwise
    'gpt-3.5-turbo' is used with MAX_TOKENS_OUT.

    NOTE(review): uses the pre-1.0 openai API (openai.ChatCompletion.create);
    requires openai<1.0.
    """
    # Original assigned model twice (a ternary immediately overwritten by an
    # if-branch); choose model and budget in one place instead.
    if gpt4:
        model = 'gpt-4'
        max_tokens_out = MAX_TOKENS_OUT * 2
    else:
        model = 'gpt-3.5-turbo'
        max_tokens_out = MAX_TOKENS_OUT
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": text},
    ]
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=0,  # deterministic output for repeatable summaries
        max_tokens=max_tokens_out,
    )
    return response.choices[0].message["content"]
def save_key_points(filename_in, filename_out, gpt4=False):
    """Read *filename_in* line by line, pack lines into prompt-sized chunks,
    summarize each chunk with gpt(), and write the summaries to *filename_out*.

    gpt4=True doubles the input character budget (and selects gpt-4 downstream).
    """
    # Character budget derived from the ~3 chars/token estimate in counttokens.
    max_length_in = MAX_TOKENS_IN * 3
    if gpt4:
        max_length_in *= 2
    chunk = prompt  # every chunk starts with the summarization instruction
    with open(filename_in, "r", encoding='utf-8') as input_file, open(filename_out, "w", encoding='utf-8', buffering=1) as output_file: # buffering=1 means write each line
        for line in input_file:
            line = line.strip()
            if len(chunk) + len(line) > max_length_in:
                # Chunk is full: summarize it and start a fresh one.
                summary = gpt(chunk, gpt4)
                output_file.write(summary + '\n\n')
                # output_file.write(chunk) # debug
                # exit()
                chunk = prompt
            # NOTE(review): a single line longer than max_length_in is still
            # appended unsplit — such a chunk may exceed the model's context.
            chunk += ' ' + line
            # chunk += '\n' + line # debug
        # Generating the summary for the remaining chunk (if any)
        if len(chunk) > len(prompt):
            summary = gpt(chunk, gpt4)
            output_file.write(summary + '\n\n')
            # output_file.write(chunk) # debug
def main():
    """Two-pass summarization: input.txt -> summary.txt -> summary-summary.txt."""
    print('starting')
    # Phase 1: bullet-point summary of the raw input (gpt-3.5-turbo).
    if os.path.exists('summary.txt'):
        print('Found summary.txt, skipping phase 1')
    else:
        save_key_points('input.txt', 'summary.txt')
    # Phase 2: condense the summary again, this time with gpt-4.
    if os.path.exists('summary-summary.txt'):
        print('Found summary-summary.txt, skipping phase 2')
    else:
        save_key_points('summary.txt', 'summary-summary.txt', gpt4=True)
    print('done')


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment