Created
April 26, 2023 23:04
-
-
Save avelican/11f429d0bb62891439f144bd7c0941a4 to your computer and use it in GitHub Desktop.
Keypoints.py - Summarize a long text document to bullet points
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import openai | |
# Instruction prepended to every chunk of text sent to the model.
prompt = "I'm a busy CEO, please summarize the following in 10-20 bullet points. Include only the most important, useful or interesting information.\n\n"

# Model name is chosen in gpt_chat_completion, since both 3.5 and 4 are used.

# Token budget for one request (gpt-3.5-turbo window; use 2048 for GPT-3).
CONTEXT_SIZE = 4096
# CONTEXT_SIZE = 1024  # tiny context for testing

# Split the window: roughly two thirds for input text, the rest for output.
MAX_TOKENS_IN = int(CONTEXT_SIZE * 2 / 3)
MAX_TOKENS_OUT = CONTEXT_SIZE - MAX_TOKENS_IN
def get_file_size(file_path):
    """Return the size of *file_path* in bytes, or None if it cannot be stat'd.

    Errors (missing file, permissions, ...) are printed rather than raised.
    """
    try:
        return os.path.getsize(file_path)
    except OSError as err:
        print(f"Error: {err}")
        return None
def counttokens(text):
    """Rough token estimate: ~3 characters per token. TODO(avelican): use tiktoken."""
    return len(text) // 3
def gpt(text, gpt4=False):
    """Summarize *text* via the chat API; use GPT-4 when *gpt4* is True.

    Thin convenience wrapper around gpt_chat_completion. The original
    if/else made the identical call in both branches, so the flag is now
    simply forwarded.
    """
    return gpt_chat_completion(text, gpt4=gpt4)
def gpt_chat_completion(text, gpt4=False):
    """Send *text* as one user message and return the model's reply text.

    Uses gpt-3.5-turbo by default; with gpt4=True switches to gpt-4 and
    doubles the output-token budget (gpt-4's context window is larger).
    Note: *text* is expected to already include the summarization prompt.
    """
    # Original code picked the model with a conditional expression and then
    # redundantly reassigned 'gpt-4' inside `if gpt4:` — consolidated here.
    model = 'gpt-4' if gpt4 else 'gpt-3.5-turbo'
    # Local variable so the module-level MAX_TOKENS_OUT is never mutated.
    max_tokens_out = MAX_TOKENS_OUT * 2 if gpt4 else MAX_TOKENS_OUT
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": text},
    ]
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=0,  # deterministic output for repeatable summaries
        max_tokens=max_tokens_out,
    )
    return response.choices[0].message["content"]
def save_key_points(filename_in, filename_out, gpt4=False):
    """Summarize *filename_in* chunk-by-chunk, writing bullets to *filename_out*.

    Input lines are packed into chunks of at most ~MAX_TOKENS_IN tokens
    (approximated at 3 characters per token, matching counttokens); each
    chunk is prefixed with the module-level `prompt` and summarized
    independently. Summaries are separated by blank lines in the output.
    """
    max_length_in = MAX_TOKENS_IN * 3  # chars; 3 chars/token heuristic
    if gpt4:
        max_length_in *= 2  # gpt-4 has a larger context window
    chunk = prompt
    # buffering=1 -> line-buffered, so partial progress survives a crash.
    with open(filename_in, "r", encoding='utf-8') as input_file, \
         open(filename_out, "w", encoding='utf-8', buffering=1) as output_file:
        for line in input_file:
            line = line.strip()
            if not line:
                # Fix: blank lines previously appended a stray ' ' each,
                # silently padding the chunk with whitespace.
                continue
            # Flush the current chunk before it would exceed the budget.
            if len(chunk) + len(line) > max_length_in:
                summary = gpt(chunk, gpt4)
                output_file.write(summary + '\n\n')
                chunk = prompt
            chunk += ' ' + line
        # Summarize the remaining partial chunk, if any text was added.
        if len(chunk) > len(prompt):
            summary = gpt(chunk, gpt4)
            output_file.write(summary + '\n\n')
def main():
    """Two-pass summarization: input.txt -> summary.txt (gpt-3.5),
    then summary.txt -> summary-summary.txt (gpt-4).

    Each phase is skipped if its output file already exists, so the
    script can be re-run to resume after an interruption.
    """
    print('starting')
    if os.path.exists('summary.txt'):
        print('Found summary.txt, skipping phase 1')
    else:
        save_key_points('input.txt', 'summary.txt')
    if os.path.exists('summary-summary.txt'):
        print('Found summary-summary.txt, skipping phase 2')
    else:
        save_key_points('summary.txt', 'summary-summary.txt', gpt4=True)
    print('done')


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment