@miagkyi
Last active March 22, 2024 20:40
Create funny tweets from 10-K/10-Q financial reports using GPT
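A minimal setup sketch, assuming standard PyPI package names for the imports below and the pre-1.0 openai SDK (the script calls openai.ChatCompletion.create, which was removed in openai>=1.0):

# Suggested installs (sketch; versions are not pinned in the gist):
#   pip install sec-api "openai<1" tiktoken beautifulsoup4 nltk pandas python-dotenv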
from datetime import datetime, timedelta
import concurrent.futures
import csv
import html
import os
import time
from bs4 import BeautifulSoup
from dotenv import load_dotenv
import nltk
import openai
import pandas as pd
from sec_api import ExtractorApi, QueryApi
import tiktoken
# Load environment variables for sensitive data and configuration
load_dotenv()
# Global configurations for tickers, API keys, and output settings
# Create a .env file with your SEC and OpenAI API keys:
# SEC_API_KEY="..."
# OPENAI_API_KEY="sk-..."
# You can get a free SEC API key here: https://sec-api.io/signup/free
SEC_API_KEY = os.getenv("SEC_API_KEY")
# Your OpenAI API key
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# Add more tickers here
TICKERS = ["AAPL"]
FILING_URLS_FILE = "filing_urls.csv"
OUTPUT_BASE_DIR = datetime.now().strftime('%Y%m%d_%H%M%S')
os.makedirs(OUTPUT_BASE_DIR, exist_ok=True)
# Initialize SEC and OpenAI API clients
queryApi = QueryApi(api_key=SEC_API_KEY)
extractorApi = ExtractorApi(api_key=SEC_API_KEY)
openai.api_key = OPENAI_API_KEY
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
def get_date_range():
    """
    Returns a date range covering the last 90 days from today's date.
    """
    end_date = datetime.now().strftime('%Y-%m-%d')
    # Select the date range here
    start_date = (datetime.now() - timedelta(days=90)).strftime('%Y-%m-%d')
    return start_date, end_date
def fetch_filing_urls(start_date, end_date):
    """
    Fetches filing URLs from the SEC API for the specified
    tickers within the given date range.
    """
    base_query = {
        "query": {
            "query_string": {
                "query": "PLACEHOLDER",
                "time_zone": "America/New_York"
            }
        },
        "from": "0",
        "size": "200",
        "sort": [{"filedAt": {"order": "desc"}}]
    }
    with open(FILING_URLS_FILE, "w", newline='') as log_file:
        writer = csv.writer(log_file)
        writer.writerow(['Company', 'Ticker', 'Filed Date', 'Report Year', 'Report Type', 'URL', 'Report Date'])
        for ticker in TICKERS:
            print(f"Starting download for ticker {ticker}")
            universe_query = (
                f'formType:("10-K" OR "10-Q") AND '
                f'filedAt:[{start_date} TO {end_date}] AND '
                f'ticker:{ticker}'
            )
            base_query["query"]["query_string"]["query"] = universe_query
            for from_batch in range(0, 9800, 200):
                base_query["from"] = str(from_batch)
                response = queryApi.get_filings(base_query)
                if len(response["filings"]) == 0:
                    break
                rows = [
                    [
                        x['companyName'],
                        ticker,
                        x['filedAt'],
                        int(x['filedAt'][:4]),
                        x['formType'],
                        x["linkToFilingDetails"],
                        x['filedAt']
                    ]
                    for x in response["filings"]
                ]
                writer.writerows(rows)
            print(f"Filing URLs downloaded for {ticker}")
def mark_tables_in_html(html_content):
    """
    Marks tables in the provided HTML content for easier processing later.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    for table in soup.find_all('table'):
        table_str = '##TABLE_START\n' + str(table) + '\n##TABLE_END'
        table.replace_with(BeautifulSoup(table_str, 'html.parser'))
    return soup.get_text()
def split_text(input_text, token_limit=6000):
    """
    Splits the text into sections, ensuring that each section
    is below the specified token limit so ChatGPT can process it.
    """
    sections = []
    current_section = ""
    current_count = 0
    table_flag = False
    sentences = nltk.sent_tokenize(input_text)
    for sentence in sentences:
        tokens = nltk.word_tokenize(sentence)
        if '##TABLE_START' in tokens:
            table_flag = True
        elif '##TABLE_END' in tokens:
            table_flag = False
        token_count = len(encoding.encode(sentence))
        if current_count + token_count <= token_limit or table_flag:
            current_section += sentence + " "
            current_count += token_count
        else:
            sections.append(current_section.strip())
            current_section = sentence + " "
            current_count = token_count
        if not table_flag and current_count + len(encoding.encode(current_section)) > token_limit:
            sections.append(current_section.strip())
            current_section = ""
            current_count = 0
    if current_section:
        sections.append(current_section.strip())
    return sections
def process_report(row):
    """
    Extracts the Management Discussion & Analysis section
    from a 10-K or 10-Q report and processes it.
    """
    report_type = row['Report Type']
    filing_url = row['URL']
    if report_type == "10-K":
        section_text = extractorApi.get_section(filing_url, '7', 'html')
    elif report_type == "10-Q":
        section_text = extractorApi.get_section(filing_url, 'part1item2', 'html')
    else:
        print(f"Unknown report type: {report_type} for company: {row['Company']}, year: {row['Report Year']}")
        return
    marked_text = mark_tables_in_html(section_text)
    decoded_text = html.unescape(marked_text)
    sections = split_text(decoded_text)
    for section in sections:
        with open(os.path.join(OUTPUT_BASE_DIR, 'all_reports.csv'), 'a', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow([row['Company'], row['Report Year'], report_type, row['Report Date'], section])
def summarize_row(row, index):
    """
    Summarizes a row from the extracted report using gpt-3.5-turbo-16k.
    """
    while True:
        try:
            # Use the OpenAI API to summarize the text
            response = openai.ChatCompletion.create(
                model="gpt-3.5-turbo-16k",
                messages=[
                    {
                        "role": "system",
                        "content": "You are an assistant."
                    },
                    {
                        "role": "user",
                        "content": (
                            f'This is a table/page from the Management Discussion & Analysis section '
                            f'of the {row["Report Year"]} {row["Report Type"]} report published by {row["Company"]}. '
                            'Using only the data provided below, please write a short and structured executive summary; '
                            f'use numbers and relative metrics.\n\nTable/page:\n"{row["Section"]}"'
                        )
                    }
                ]
            )
            # Extract the assistant's reply
            summarized_text = response['choices'][0]['message']['content']
            return index, summarized_text
        except Exception as e:
            print(f"An error occurred: {e}. Retrying...")
            time.sleep(5)
def generate_summaries_gpt35():
    """
    Uses gpt-3.5-turbo-16k to generate summaries for
    all the reports in 4 parallel streams.
    """
    input_file = os.path.join(OUTPUT_BASE_DIR, 'all_reports.csv')
    df = pd.read_csv(input_file)
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        futures = [executor.submit(summarize_row, row, index) for index, row in df.iterrows()]
        for future in concurrent.futures.as_completed(futures):
            index, summarized_text = future.result()
            # Add the summarized text to the original dataframe
            df.loc[index, 'Summarized'] = summarized_text
            # Save the dataframe with the summarized text to a new csv file after each summary
            output_file = os.path.join(OUTPUT_BASE_DIR, os.path.basename(input_file).split('.')[0] + '_gpt35_summary.csv')
            df.sort_index().to_csv(output_file, quoting=csv.QUOTE_NONNUMERIC, index=False)
    print(f"Total lines processed: {len(df[df['Summarized'].notnull()])}")
def create_3_tweets(row, index):
    """
    Uses gpt-4 to generate 3 tweets per summary.
    """
    while True:
        try:
            # Use the OpenAI API to write funny tweets
            response = openai.ChatCompletion.create(
                model="gpt-4",
                messages=[
                    {
                        "role": "system",
                        "content": "You are one of the best comedians in the world, writing hilarious jokes about stocks."
                    },
                    {
                        "role": "user",
                        "content": (
                            f'Write 3 funny and sarcastic tweets about {row["Company"]} '
                            f'performance based on the summary of their {row["Report Type"]} '
                            f'financial report for {row["Report Year"]} below. '
                            'Make sure to use numbers and metrics, and be insightful. '
                            'Try to be really creative; mix satire, sarcasm, unexpectedness, '
                            'exaggeration, provocation and risk to create the top jokes:'
                            f'\n"{row["Summarized"]}"'
                        )
                    }
                ]
            )
            # Extract the assistant's reply
            tweet_text = response['choices'][0]['message']['content']
            return index, tweet_text
        except Exception as e:
            print(f"An error occurred: {e}. Retrying...")
            time.sleep(5)
def generate_tweets_gpt4():
    """
    Uses gpt-4 to generate tweets for all the summaries in 2 parallel streams.
    """
    # Adjust this path to match where the summarized reports are stored
    input_file = os.path.join(OUTPUT_BASE_DIR, 'all_reports_gpt35_summary.csv')
    df = pd.read_csv(input_file, encoding='utf-8')
    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
        futures = [executor.submit(create_3_tweets, row, index) for index, row in df.iterrows()]
        for future in concurrent.futures.as_completed(futures):
            index, tweet_text = future.result()
            df.loc[index, 'Tweets'] = tweet_text
            output_file = input_file.split('.')[0] + '_gpt4_tweets.csv'
            df.sort_index().to_csv(output_file, quoting=csv.QUOTE_NONNUMERIC, index=False, encoding='utf-8')
    print(f"Total lines processed: {len(df[df['Tweets'].notnull()])}")
def main():
    # Download the NLTK dataset required for sentence tokenization
    nltk.download('punkt')
    # Fetch the date range and filing URLs
    start_date, end_date = get_date_range()
    fetch_filing_urls(start_date, end_date)
    # Load the filing metadata written by fetch_filing_urls
    filings_df = pd.read_csv(FILING_URLS_FILE)
    # Initialize the CSV file that will collect all report sections
    with open(os.path.join(OUTPUT_BASE_DIR, 'all_reports.csv'), 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Company', 'Report Year', 'Report Type', 'Report Date', 'Section'])
    # Process each report directly
    for _, row in filings_df.iterrows():
        process_report(row)
    # Summarize the reports
    generate_summaries_gpt35()
    # Create funny tweets
    generate_tweets_gpt4()


if __name__ == '__main__':
    main()
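For reference, a sketch of the files a full run is expected to produce, based on the paths used above (the timestamped directory name will differ per run):

# filing_urls.csv                                          - filing metadata, written to the working directory
# <timestamp>/all_reports.csv                              - chunked MD&A sections
# <timestamp>/all_reports_gpt35_summary.csv                - adds the 'Summarized' column
# <timestamp>/all_reports_gpt35_summary_gpt4_tweets.csv    - adds the 'Tweets' column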