Skip to content

Instantly share code, notes, and snippets.

@jkpe
Forked from avelican/keypoints.py
Created July 4, 2023 12:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jkpe/534fef4b31a50b3d295fcbc7fe06197a to your computer and use it in GitHub Desktop.
Save jkpe/534fef4b31a50b3d295fcbc7fe06197a to your computer and use it in GitHub Desktop.
Keypoints.py - Summarize a long text document to bullet points
import os
import openai
# Instruction prepended to every chunk of text sent to the model.
prompt = "I'm a busy CEO, please summarize the following in 10-20 bullet points. Include only the most important, useful or interesting information.\n\n"
# model = 'gpt-3.5-turbo' # 'gpt-4' # moved to gpt_chat_completion since we use both
CONTEXT_SIZE = 4096  # total token window; original note suggests 2048 was used for GPT-3 (comment was truncated in source)
# CONTEXT_SIZE = 1024 # tiny context for testing
MAX_TOKENS_IN = int(CONTEXT_SIZE*2/3)  # ~2/3 of the window budgeted for input text
MAX_TOKENS_OUT = CONTEXT_SIZE - MAX_TOKENS_IN  # remainder reserved for the completion
def get_file_size(file_path):
    """Return the size of *file_path* in bytes, or None if it cannot be read."""
    try:
        return os.path.getsize(file_path)
    except OSError as e:
        # Best-effort helper: report the problem and signal failure with None.
        print(f"Error: {e}")
        return None
def counttokens(text):
    """Rough token estimate: ~3 characters per token. TODO: use tiktoken."""
    return len(text) // 3
def gpt(text, gpt4=False):
    """Summarize *text* via a chat completion.

    Thin convenience wrapper kept for existing callers. The original
    if/else was redundant — both branches called the same function, so
    the flag is simply forwarded.

    Args:
        text: Full prompt text (instruction already prepended by caller).
        gpt4: When True, use GPT-4 instead of gpt-3.5-turbo.

    Returns:
        The model's completion text.
    """
    return gpt_chat_completion(text, gpt4=gpt4)
def gpt_chat_completion(text, gpt4=False):
    """Send *text* as a single user message and return the completion text.

    Args:
        text: The full prompt (summarization instruction already included).
        gpt4: Use 'gpt-4' with a doubled output-token budget; otherwise
            'gpt-3.5-turbo' with MAX_TOKENS_OUT.

    Returns:
        The assistant's message content from the first choice.
    """
    # Fix: the original assigned `model` via this conditional and then
    # redundantly re-assigned 'gpt-4' inside the `if gpt4:` branch.
    model = 'gpt-4' if gpt4 else 'gpt-3.5-turbo'
    max_tokens_out = MAX_TOKENS_OUT  # separate variable so it can be doubled
    if gpt4:
        # GPT-4's larger context window allows a bigger completion budget.
        max_tokens_out *= 2
    # note: the prompt is already included in text
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": text}
    ]
    # NOTE(review): openai.ChatCompletion is the legacy (<1.0) SDK surface;
    # openai>=1.0 requires client.chat.completions.create() instead — confirm
    # the pinned openai version before upgrading.
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=0,  # deterministic output for repeatable summaries
        max_tokens=max_tokens_out
    )
    return response.choices[0].message["content"]
def save_key_points(filename_in, filename_out, gpt4=False):
    """Split *filename_in* into model-sized chunks, summarize each one,
    and write the bullet-point summaries to *filename_out*.

    Args:
        filename_in: Path of the text file to summarize.
        filename_out: Path that receives each chunk's summary, separated
            by blank lines.
        gpt4: When True, summarize with GPT-4 and allow double-sized chunks.
    """
    limit = MAX_TOKENS_IN * 3  # chars-per-token heuristic mirrors counttokens()
    if gpt4:
        limit *= 2
    chunk = prompt
    # buffering=1 -> line-buffered output, so each summary is flushed as written
    with open(filename_in, "r", encoding='utf-8') as src, \
         open(filename_out, "w", encoding='utf-8', buffering=1) as dst:
        for raw_line in src:
            stripped = raw_line.strip()
            # Flush the current chunk before it would overflow the budget.
            if len(chunk) + len(stripped) > limit:
                dst.write(gpt(chunk, gpt4) + '\n\n')
                chunk = prompt
            chunk += ' ' + stripped
        # Summarize whatever text remains after the last full chunk.
        if len(chunk) > len(prompt):
            dst.write(gpt(chunk, gpt4) + '\n\n')
def main():
    """Two-phase pipeline: input.txt -> summary.txt -> summary-summary.txt.

    Each phase is skipped when its output file already exists, so the
    script can be re-run after an interruption without repeating work.
    """
    print('starting')
    # Phase 1: bullet-point summary of the raw input (default model).
    if os.path.exists('summary.txt'):
        print('Found summary.txt, skipping phase 1')
    else:
        save_key_points('input.txt', 'summary.txt')
    # Phase 2: condense the phase-1 summary using GPT-4.
    if os.path.exists('summary-summary.txt'):
        print('Found summary-summary.txt, skipping phase 2')
    else:
        save_key_points('summary.txt', 'summary-summary.txt', gpt4=True)
    print('done')

if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment