Skip to content

Instantly share code, notes, and snippets.

@neosavvy
Created March 20, 2024 13:29
Show Gist options
  • Save neosavvy/3fafdeb0fabc3ba5ff2c5e2a8f6dfe1c to your computer and use it in GitHub Desktop.
Save neosavvy/3fafdeb0fabc3ba5ff2c5e2a8f6dfe1c to your computer and use it in GitHub Desktop.
import instructor
import time
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import TokenTextSplitter
from langchain_community.document_loaders import UnstructuredURLLoader
from openai import OpenAI
from pydantic import BaseModel
from pydantic import Field
from enum import Enum
from typing import Optional, Union, List
class UnitSuffix(str, Enum):
    """Scale word that accompanies a reported figure.

    The ``str`` mix-in makes each member an actual string, so a member
    compares equal to its plain value (``UnitSuffix.million == 'Million'``).
    """

    billion = 'Billion'
    million = 'Million'
    thousand = 'Thousand'
    # `unknown` carries the empty string as its value.
    unknown = ''
class FiscalPeriod(str, Enum):
    """Fiscal-year label for an income statement.

    The ``str`` mix-in makes each member an actual string, so a member
    compares equal to its plain value (``FiscalPeriod.fy_2023 == 'FY2023'``).
    """

    fy_2023 = 'FY2023'
    fy_2022 = 'FY2022'
    fy_2021 = 'FY2021'
    fy_2020 = 'FY2020'
    # `unknown` carries the empty string as its value.
    unknown = ''
# Define our income statement
class IncomeStatement(BaseModel):
    """One income statement: a fiscal period plus its line items.

    Every line item is a pair of fields: the amount (``float`` or ``str``)
    and a companion ``<name>_unit`` field holding a ``UnitSuffix``.

    NOTE(review): ``period`` and the ``*_unit`` fields are annotated
    ``Optional[...]`` but declare no default. Under pydantic v2 that would
    make them nullable yet still required; confirm this is intended.
    """

    period: Optional[FiscalPeriod]

    revenue: Union[float, str] = Field(description="Revenue")
    revenue_unit: Optional[UnitSuffix]

    cost_of_revenue: Union[float, str] = Field(description="Cost of revenue")
    cost_of_revenue_unit: Optional[UnitSuffix]

    income_from_operations: Union[float, str] = Field(description="Income from operations")
    income_from_operations_unit: Optional[UnitSuffix]

    operations_and_support: Union[float, str] = Field(description="Operations and support")
    operations_and_support_unit: Optional[UnitSuffix]

    product_development: Union[float, str] = Field(description="Product development")
    product_development_unit: Optional[UnitSuffix]

    sales_and_marketing: Union[float, str] = Field(description="Sales and marketing")
    sales_and_marketing_unit: Optional[UnitSuffix]

    general_and_administrative: Union[float, str] = Field(description="General and administrative")
    general_and_administrative_unit: Optional[UnitSuffix]

    interest_income: Union[float, str] = Field(description="Interest income")
    interest_income_unit: Optional[UnitSuffix]

    interest_expense: Union[float, str] = Field(description="Interest expense")
    interest_expense_unit: Optional[UnitSuffix]

    other_income: Union[float, str] = Field(description="Other income")
    other_income_unit: Optional[UnitSuffix]

    net_income: Union[float, str] = Field(description="Net income")
    net_income_unit: Optional[UnitSuffix]
class IncomeStatements(BaseModel):
    """Container model holding a list of ``IncomeStatement`` entries."""

    income_statements: List[IncomeStatement]
# Airbnb
# url = "https://www.sec.gov/Archives/edgar/data/1559720/000155972024000006/abnb-20231231.htm"
# Apple
url = "https://www.sec.gov/Archives/edgar/data/320193/000032019323000106/aapl-20230930.htm"

# The User-Agent value below is a placeholder; substitute real details before running.
loader = UnstructuredURLLoader(urls=[url], headers={'User-Agent': 'your-org your@org.com'})
documents = loader.load()

# Naively chunk the SEC filing by tokens
token_splitter = TokenTextSplitter(chunk_size=1024, chunk_overlap=200)
docs = token_splitter.split_documents(documents)

# Embed every chunk and index the vectors in an in-memory FAISS store.
vectorstore = FAISS.from_documents(docs, OpenAIEmbeddings(model="text-embedding-3-large"))

query = "Consolidated Statements of Operations (in millions)"

# Get documents from the vector DB
k = 1
top_k_docs = vectorstore.similarity_search(query, k=k)
context = "\n".join(doc.page_content for doc in top_k_docs)

# perf_counter() is monotonic, so the elapsed time cannot be skewed by
# system clock adjustments the way a time.time() difference can.
start = time.perf_counter()

# Patch the OpenAI client so chat completions accept `response_model`.
client = instructor.patch(OpenAI())
income_statements = client.chat.completions.create(
    model="gpt-3.5-turbo",
    response_model=instructor.Partial[IncomeStatements],
    messages=[
        {
            "role": "user",
            "content": f"Extract the company's income statement from 2023, 2022, and 2021 "
                       f"from following context: {context}",
        },
    ],
)

print(f"Took {time.perf_counter() - start} seconds to complete!")
print(income_statements.model_dump_json(indent=2))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment