Created
March 20, 2024 13:19
-
-
Save neosavvy/dbd7a75c71f5b194f9e341829fefe803 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import time
from enum import Enum
from typing import Optional, Union, List

import instructor
from langchain.text_splitter import TokenTextSplitter
from langchain_community.document_loaders import UnstructuredURLLoader
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from openai import OpenAI
from pydantic import BaseModel
from pydantic import Field
class UnitSuffix(str, Enum):
    """Magnitude suffix attached to a reported figure.

    Mixing in ``str`` makes each member compare equal to its plain string
    value (e.g. ``UnitSuffix.million == 'Million'``).
    """

    billion = 'Billion'
    million = 'Million'
    thousand = 'Thousand'
    # The empty string is the value of the `unknown` member.
    unknown = ''
class FiscalPeriod(str, Enum):
    """Fiscal-year label for an income statement.

    Mixing in ``str`` makes each member compare equal to its plain string
    value (e.g. ``FiscalPeriod.fy_2023 == 'FY2023'``). Only FY2020 through
    FY2023 are listed, plus an empty-string `unknown` member.
    """

    fy_2023 = 'FY2023'
    fy_2022 = 'FY2022'
    fy_2021 = 'FY2021'
    fy_2020 = 'FY2020'
    # The empty string is the value of the `unknown` member.
    unknown = ''
# Define our income statement
class IncomeStatement(BaseModel):
    """Line items of a single income statement, used as an extraction schema.

    Every amount is typed ``Union[float, str]`` so it can hold either a number
    or the raw text of the figure, and each amount is paired with a
    ``<name>_unit`` field holding an optional ``UnitSuffix``.
    """

    # Fiscal year the statement covers; may be None.
    period: Optional[FiscalPeriod]

    # Top-line figures.
    revenue: Union[float, str] = Field(description="Revenue")
    revenue_unit: Optional[UnitSuffix]
    cost_of_revenue: Union[float, str] = Field(description="Cost of revenue")
    cost_of_revenue_unit: Optional[UnitSuffix]
    income_from_operations: Union[float, str] = Field(description="Income from operations")
    income_from_operations_unit: Optional[UnitSuffix]

    # Operating expense line items.
    operations_and_support: Union[float, str] = Field(description="Operations and support")
    operations_and_support_unit: Optional[UnitSuffix]
    product_development: Union[float, str] = Field(description="Product development")
    product_development_unit: Optional[UnitSuffix]
    sales_and_marketing: Union[float, str] = Field(description="Sales and marketing")
    sales_and_marketing_unit: Optional[UnitSuffix]
    general_and_administrative: Union[float, str] = Field(description="General and administrative")
    general_and_administrative_unit: Optional[UnitSuffix]

    # Interest and other items.
    interest_income: Union[float, str] = Field(description="Interest income")
    interest_income_unit: Optional[UnitSuffix]
    interest_expense: Union[float, str] = Field(description="Interest expense")
    interest_expense_unit: Optional[UnitSuffix]
    other_income: Union[float, str] = Field(description="Other income")
    other_income_unit: Optional[UnitSuffix]

    # Bottom line.
    net_income: Union[float, str] = Field(description="Net income")
    net_income_unit: Optional[UnitSuffix]
class IncomeStatements(BaseModel):
    """Container model holding a list of ``IncomeStatement`` entries."""

    income_statements: List[IncomeStatement]
# Airbnb
# url = "https://www.sec.gov/Archives/edgar/data/1559720/000155972024000006/abnb-20231231.htm"
# Apple
# NOTE(review): the active URL's filename (a10-k20189292018) suggests a
# fiscal-2018 filing, while the prompt below asks for 2023, 2022, and 2021 and
# FiscalPeriod only lists FY2020-FY2023 -- confirm which filing is intended.
url = "https://www.sec.gov/Archives/edgar/data/320193/000032019318000145/a10-k20189292018.htm"

# The User-Agent value is a placeholder; replace it with your own contact
# details before running.
loader = UnstructuredURLLoader(urls=[url], headers={'User-Agent': 'your-org your@org.com'})
documents = loader.load()

# Naively chunk the SEC filing by tokens
token_splitter = TokenTextSplitter(chunk_size=512, chunk_overlap=20)
docs = token_splitter.split_documents(documents)

# Embed every chunk and index it for similarity search.
vectorstore = FAISS.from_documents(docs, OpenAIEmbeddings(model="text-embedding-3-large"))
query = "Consolidated Statements of Operations (in millions)"

# Get documents from the vector DB
k = 1
top_k_docs = vectorstore.similarity_search(query, k=k)
context = "\n".join(doc.page_content for doc in top_k_docs)

# perf_counter is monotonic, so the elapsed time cannot be skewed by
# wall-clock adjustments.
start = time.perf_counter()

client = instructor.patch(OpenAI())
# NOTE(review): instructor.Partial is used here without stream=True --
# confirm this is intended.
income_statements = client.chat.completions.create(
    model="gpt-3.5-turbo",
    response_model=instructor.Partial[IncomeStatements],
    messages=[
        {
            "role": "user",
            "content": f"Extract the company's income statement from 2023, 2022, and 2021 "
                       f"from following context: {context}",
        },
    ],
)

print(f"Took {time.perf_counter() - start} seconds to complete!")
print(income_statements.model_dump_json(indent=2))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment