Last active
March 22, 2024 20:40
-
-
Save miagkyi/fcb1b19284ab5a086fd593318cdf1046 to your computer and use it in GitHub Desktop.
Create funny tweets from 10-K/10-Q financial reports using GPT
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from datetime import datetime, timedelta | |
import concurrent.futures | |
import csv | |
import html | |
import os | |
import time | |
from bs4 import BeautifulSoup | |
from dotenv import load_dotenv | |
import nltk | |
import openai | |
import pandas as pd | |
from sec_api import ExtractorApi, QueryApi | |
import tiktoken | |
# Load environment variables for sensitive data and configuration
load_dotenv()
# Global configurations for tickers, API keys, and output settings
# Create a .env file with your sec and openai API keys
# SEC_API_KEY="..."
# OPENAI_API_KEY="sk-..."
# You can get your free api key here https://sec-api.io/signup/free
SEC_API_KEY = os.getenv("SEC_API_KEY")
# Your OpenAI API key
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# Add more tickers here
TICKERS = ["AAPL"]
# CSV log of every filing URL found by fetch_filing_urls()
FILING_URLS_FILE = "filing_urls.csv"
# Each run writes into a fresh timestamped directory, e.g. 20240322_204000/
OUTPUT_BASE_DIR = datetime.now().strftime('%Y%m%d_%H%M%S')
os.makedirs(OUTPUT_BASE_DIR, exist_ok=True)
# Initialize SEC and OpenAI API clients
queryApi = QueryApi(api_key=SEC_API_KEY)
extractorApi = ExtractorApi(api_key=SEC_API_KEY)
openai.api_key = OPENAI_API_KEY
# Tokenizer used to size text chunks for the ChatGPT context window
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
def get_date_range():
    """
    Return the filing search window as ISO date strings.

    Returns:
        tuple[str, str]: (start_date, end_date) in 'YYYY-MM-DD' format,
        where end_date is today and start_date is 90 days earlier.
        (The original docstring said "one week"; the code uses 90 days.)
    """
    # Capture "now" once so start and end cannot straddle midnight.
    now = datetime.now()
    end_date = now.strftime('%Y-%m-%d')
    # Adjust the timedelta here to widen or narrow the search window.
    start_date = (now - timedelta(days=90)).strftime('%Y-%m-%d')
    return start_date, end_date
def fetch_filing_urls(start_date, end_date):
    """
    Query the SEC full-text search API for 10-K/10-Q filings of each ticker
    in TICKERS filed within [start_date, end_date], and log one CSV row per
    filing to FILING_URLS_FILE.
    """
    # Query skeleton; the Lucene query string is filled in per ticker.
    query_template = {
        "query": {
            "query_string": {
                "query": "PLACEHOLDER",
                "time_zone": "America/New_York"
            }
        },
        "from": "0",
        "size": "200",
        "sort": [{"filedAt": {"order": "desc"}}]
    }
    with open(FILING_URLS_FILE, "w", newline='') as log_file:
        csv_out = csv.writer(log_file)
        csv_out.writerow(['Company', 'Ticker', 'Filed Date', 'Report Year', 'Report Type', 'URL', 'Report Date'])
        for ticker in TICKERS:
            print(f"Starting download for ticker {ticker}")
            # Annual and quarterly reports for this ticker in the window.
            query_template["query"]["query_string"]["query"] = (
                f'formType:("10-K" OR "10-Q") AND '
                f'filedAt:[{start_date} TO {end_date}] AND '
                f'ticker:{ticker}'
            )
            # Page through results 200 at a time until a page comes back empty.
            for offset in range(0, 9800, 200):
                query_template["from"] = str(offset)
                page = queryApi.get_filings(query_template)
                filings = page["filings"]
                if not filings:
                    break
                for filing in filings:
                    csv_out.writerow([
                        filing['companyName'],
                        ticker,
                        filing['filedAt'],
                        int(filing['filedAt'][:4]),  # report year from the filing date
                        filing['formType'],
                        filing["linkToFilingDetails"],
                        filing['filedAt'],
                    ])
            print(f"Filing URLs downloaded for {ticker}")
def mark_tables_in_html(html_content):
    """
    Wrap every <table> element with ##TABLE_START / ##TABLE_END sentinel
    markers and return the plain text, so table regions stay identifiable
    after the HTML tags are stripped.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    for tbl in soup.find_all('table'):
        wrapped = f'##TABLE_START\n{tbl}\n##TABLE_END'
        tbl.replace_with(BeautifulSoup(wrapped, 'html.parser'))
    return soup.get_text()
def split_text(input_text, token_limit=6000):
    """
    Splits the text into sections ensuring that each section
    is below the specified token limit so ChatGPT can process it.
    """
    sections = []
    current_section = ""
    current_count = 0
    table_flag = False
    # NLTK sentence tokenizer; requires the 'punkt' model (downloaded in main()).
    sentences = nltk.sent_tokenize(input_text)
    for sentence in sentences:
        tokens = nltk.word_tokenize(sentence)
        # Track whether we are inside a table marked by mark_tables_in_html();
        # while inside, sentences are appended even past the token limit so a
        # table is never split across sections.
        if '##TABLE_START' in tokens:
            table_flag = True
        elif '##TABLE_END' in tokens:
            table_flag = False
        # Size the sentence with the model's tokenizer, not NLTK's word count.
        token_count = len(encoding.encode(sentence))
        if current_count + token_count <= token_limit or table_flag:
            current_section += sentence + " "
            current_count += token_count
        else:
            # Sentence would overflow: flush the section and start a new one.
            sections.append(current_section.strip())
            current_section = sentence + " "
            current_count = token_count
        # NOTE(review): this re-encodes the whole accumulated section and adds
        # it to current_count, which already counts those same tokens — so the
        # flush effectively triggers near half the limit. Looks like double
        # counting; confirm intent before changing.
        if not table_flag and current_count + len(encoding.encode(current_section)) > token_limit:
            sections.append(current_section.strip())
            current_section = ""
            current_count = 0
    # Flush any trailing partial section.
    if current_section:
        sections.append(current_section.strip())
    return sections
def process_report(row):
    """
    Extract the Management Discussion & Analysis section from a 10-K or
    10-Q filing and append its token-limited sections to all_reports.csv.

    Args:
        row: Mapping (e.g. pandas Series) with 'Report Type', 'URL',
            'Company', 'Report Year' and 'Report Date' keys, as produced
            by fetch_filing_urls().

    Returns:
        None. Side effect: appends one CSV row per text section to
        <OUTPUT_BASE_DIR>/all_reports.csv; prints and skips unknown types.
    """
    report_type = row['Report Type']
    filing_url = row['URL']
    # MD&A lives in Item 7 of a 10-K and Part I Item 2 of a 10-Q.
    if report_type == "10-K":
        section_text = extractorApi.get_section(filing_url, '7', 'html')
    elif report_type == "10-Q":
        section_text = extractorApi.get_section(filing_url, 'part1item2', 'html')
    else:
        print(f"Unknown report type: {report_type} for company: {row['Company']}, year: {row['Report Year']}")
        return
    marked_text = mark_tables_in_html(section_text)
    decoded_text = html.unescape(marked_text)
    sections = split_text(decoded_text)
    # Open the output file once and write all sections in a single pass
    # (the original reopened the file for every section).
    with open(os.path.join(OUTPUT_BASE_DIR, 'all_reports.csv'), 'a', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerows(
            [row['Company'], row['Report Year'], report_type, row['Report Date'], section]
            for section in sections
        )
def summarize_row(row, index):
    """
    Summarize one extracted report section with gpt-3.5-turbo-16k.

    Returns (index, summary_text) so the caller can place the result back
    into the correct dataframe row.
    """
    # The row's content never changes between retries, so build the prompt once.
    prompt = (
        f'This is a table/page from a Management Discussion & Analysis section '
        f'of {row["Report Year"]} {row["Report Type"]} report from {row["Company"]} published. '
        'Using only data provided below please write a short and structured executive summary, '
        f'use numbers and relative metrics.\n\n Table/page:\n"{row["Section"]}"'
    )
    messages = [
        {"role": "system", "content": "You are an assistant."},
        {"role": "user", "content": prompt},
    ]
    # Retry forever on API errors (rate limits, transient network failures).
    while True:
        try:
            response = openai.ChatCompletion.create(
                model="gpt-3.5-turbo-16k",
                messages=messages,
            )
            return index, response['choices'][0]['message']['content']
        except Exception as e:
            print(f"An error occurred: {e}. Retrying...")
            time.sleep(5)
def generate_summaries_gpt35():
    """
    Summarize every extracted report section with gpt-3.5-turbo-16k using
    four worker threads, checkpointing the CSV after each completion.
    """
    input_file = os.path.join(OUTPUT_BASE_DIR, 'all_reports.csv')
    df = pd.read_csv(input_file)
    # Output path is constant across iterations, so compute it up front.
    stem = os.path.basename(input_file).split('.')[0]
    output_file = os.path.join(OUTPUT_BASE_DIR, stem + '_gpt35_summary.csv')
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        pending = [executor.submit(summarize_row, row, index) for index, row in df.iterrows()]
        for done in concurrent.futures.as_completed(pending):
            idx, summary = done.result()
            df.loc[idx, 'Summarized'] = summary
            # Checkpoint after every summary so a crash loses no finished work.
            df.sort_index().to_csv(output_file, quoting=csv.QUOTE_NONNUMERIC, index=False)
    print(f"Total lines processed: {len(df[df['Summarized'].notnull()])}")
def create_3_tweets(row, index):
    """
    Generate three humorous tweets about one summarized report using gpt-4.

    Returns (index, tweets_text) so the caller can place the result back
    into the correct dataframe row.
    """
    # The row never changes between retries, so build the messages once.
    prompt = (
        f'Write 3 funny and sarcastic tweets about {row["Company"]} '
        f'performance based on the summary of their {row["Report Type"]} '
        f'financial report for {row["Report Year"]} below. '
        'Make sure to use numbers and metrics, be insightful. '
        'Try to be really creative, mix satire, sarcasm, unexpectedness, '
        'exaggeration, provocation and risk to create the top jokes:'
        f'\n"{row["Summarized"]}"'
    )
    messages = [
        {
            "role": "system",
            "content": "You are one of the best comedians in the world writing hilarious jokes about the stocks."
        },
        {"role": "user", "content": prompt},
    ]
    # Retry forever on API errors (rate limits, transient network failures).
    while True:
        try:
            response = openai.ChatCompletion.create(
                model="gpt-4",
                messages=messages,
            )
            return index, response['choices'][0]['message']['content']
        except Exception as e:
            print(f"An error occurred: {e}. Retrying...")
            time.sleep(5)
def generate_tweets_gpt4():
    """
    Generate tweets for all summaries with gpt-4 using two worker threads.

    Reads <OUTPUT_BASE_DIR>/all_reports_gpt35_summary.csv (written by
    generate_summaries_gpt35) and writes the same rows plus a 'Tweets'
    column to <input stem>_gpt4_tweets.csv.
    """
    # Adjust this path to match where the summarized reports are stored
    input_file = os.path.join(OUTPUT_BASE_DIR, 'all_reports_gpt35_summary.csv')
    df = pd.read_csv(input_file, encoding='utf-8')
    # splitext strips only the extension; the original split('.')[0] would
    # truncate the path at the FIRST dot anywhere in it.
    output_file = os.path.splitext(input_file)[0] + '_gpt4_tweets.csv'
    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
        futures = [executor.submit(create_3_tweets, row, index) for index, row in df.iterrows()]
        for future in concurrent.futures.as_completed(futures):
            index, tweet_text = future.result()
            df.loc[index, 'Tweets'] = tweet_text
            # Checkpoint after every batch of tweets so a crash loses no work.
            df.sort_index().to_csv(output_file, quoting=csv.QUOTE_NONNUMERIC, index=False, encoding='utf-8')
    print(f"Total lines processed: {len(df[df['Tweets'].notnull()])}")
def main():
    """Run the full pipeline: fetch filings, extract MD&A, summarize, tweet."""
    # Sentence-tokenizer model needed by split_text().
    nltk.download('punkt')
    # Determine the search window and collect matching filing URLs.
    start_date, end_date = get_date_range()
    fetch_filing_urls(start_date, end_date)
    filings_df = pd.read_csv(FILING_URLS_FILE)
    # Create the aggregate report file with its header row;
    # process_report() appends section rows to it afterwards.
    report_path = os.path.join(OUTPUT_BASE_DIR, 'all_reports.csv')
    with open(report_path, 'w', newline='', encoding='utf-8') as f:
        csv.writer(f).writerow(['Company', 'Report Year', 'Report Type', 'Report Date', 'Section'])
    # Extract the MD&A section of each filing.
    for _, filing_row in filings_df.iterrows():
        process_report(filing_row)
    # Summarize the sections, then turn the summaries into tweets.
    generate_summaries_gpt35()
    generate_tweets_gpt4()

if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment