# Created April 28, 2023 09:00
import sys
import os
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.text_splitter import MarkdownTextSplitter
from langchain.text_splitter import CharacterTextSplitter
# NOTE(review): the module path and the import lists below were truncated in
# this copy of the file. Only the names the script actually uses are imported;
# confirm the path against the installed langchain version.
from langchain.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.schema import (
    SystemMessage,
)

# Human turn of the conversation. process_file() fills this template with
# `text=<chunk>`, so it must expose a `{text}` placeholder.
# NOTE(review): the original template body was lost; verify the wording.
human_template = """
{text}
"""
human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)

# System turn: fixed editing instructions sent with every chunk.
system_text = """You are an expert technical editor specializing in machine learning books written for machine learning engineers and data scientist. You are especially good at cutting clutter.
- Improve grammar and language
- fix errors
- cut clutter
- keep tone and voice
- don't change markdown syntax, e.g. keep [@reference]
- never cut jokes
- output 1 line per sentence (same as input)
"""
system_prompt = SystemMessage(content=system_text)
# Path of the file holding the OpenAI API key.
keyfile = "oai.key"
with open(keyfile, 'r') as f:
    # Strip surrounding whitespace so a trailing newline in the key file does
    # not end up inside the API key.
    key = f.read().strip()

# If you get timeouts, you might have to increase the request_timeout parameter.
llm = ChatOpenAI(openai_api_key=key, model="gpt-4", request_timeout=240)
def process_file(input_file):
    """Run every chunk of *input_file* through the chat model and save the result.

    The file is read in full, split into chunks, and each chunk is sent to
    the module-level ``llm`` together with ``system_prompt`` and
    ``human_message_prompt``. The model's replies are written, one per chunk
    and each followed by a newline, to a file with the same base name as
    *input_file* and a ``.qmd`` extension.

    Args:
        input_file: Path of the text file to edit.

    NOTE: if *input_file* already ends in ``.qmd`` the output path equals the
    input path, so the input file is overwritten.
    """
    output_file = os.path.splitext(input_file)[0] + ".qmd"

    # Explicit UTF-8 so non-ASCII text does not depend on the platform's
    # default encoding.
    with open(input_file, 'r', encoding="utf-8") as f:
        content = f.read()

    # Markdown splitter didn't work so well, so the tiktoken-based character
    # splitter is used instead.
    splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=1000, chunk_overlap=0)
    docs = splitter.split_text(content)
    print(f"Split into {len(docs)} docs")

    chat_prompt = ChatPromptTemplate.from_messages([system_prompt, human_message_prompt])

    with open(output_file, 'w', encoding="utf-8") as f:
        for doc in docs:
            # One model call per chunk; chunks are written in input order.
            result = llm(chat_prompt.format_prompt(text=doc).to_messages())
            f.write(result.content + '\n')

    print(f"Edited file saved as {output_file}")
if __name__ == "__main__":
    # Script entry point: expects the path of the file to edit as the first
    # command-line argument.
    if len(sys.argv) < 2:
        print(f"Usage: python {sys.argv[0]} input_file")
        # Stop here; without this the sys.argv[1] lookup below raises
        # IndexError when no argument was given.
        sys.exit(1)

    input_file = sys.argv[1]
    process_file(input_file)
