Skip to content

Instantly share code, notes, and snippets.

@mxchinegod
Created June 18, 2023 13:41
Show Gist options
  • Save mxchinegod/6489446b65d5961652cae755b72f0223 to your computer and use it in GitHub Desktop.
Disgusting SEC Filing ripper
import requests
from tqdm import tqdm
import feedparser
from bs4 import BeautifulSoup
from time import sleep
import os
from pprint import pprint
#!python3 -m pip install feedparser
def fancy_message(tag, body):
    """Print *body* as a colored, emoji-prefixed console message.

    Parameters
    ----------
    tag : str
        Severity label, case-insensitive: "FATAL", "WARN", "INFO" or "WAIT".
    body : str
        The message text to display.

    An unrecognized tag falls back to a plain ``Unknown tag: ...`` line.
    Returns None; this function only prints.
    """
    # Dict lookup replaces the original linear scan over a list of tuples.
    styles = {
        "FATAL": ("☠️", "\033[91m"),  # red
        "WARN": ("🚨", "\033[93m"),   # yellow
        "INFO": ("ℹ️", "\033[94m"),   # blue
        "WAIT": ("☕️", "\033[96m"),   # cyan
    }
    key = tag.upper()
    if key in styles:
        emoji, color_code = styles[key]
        # \033[0m resets the terminal color after the message.
        print(f'{color_code}{emoji} {key}: {body}\033[0m')
    else:
        print(f'Unknown tag: {tag}')
from datetime import datetime
def print_current_datetime():
    """Return the current local date and time as a human-readable string.

    Format: "<Month> <day>, <year> <hh>:<mm> <AM/PM>",
    e.g. "June 18, 2023 01:41 PM".

    NOTE(review): despite the name, this RETURNS the string rather than
    printing it — kept as-is so existing callers keep working.
    """
    # Single combined format string instead of two strftime calls joined later.
    return datetime.now().strftime("%B %d, %Y %I:%M %p")
# ---------------------------------------------------------------------------
# Main scrape loop: poll the SEC EDGAR "current events" Atom feed, collect
# document links from each filing's index page, download them to disk, then
# sleep two minutes and repeat.
# ---------------------------------------------------------------------------

def _fetch_filings(user_agent):
    """Parse the EDGAR current-events Atom feed into a list of filing dicts.

    Each dict holds the filing index-page url, title, updated timestamp,
    and an (initially empty) list of document links.
    """
    feedparser.USER_AGENT = user_agent
    rss_feed_url = "https://www.sec.gov/cgi-bin/browse-edgar?action=getcurrent&CIK=&type=&company=&dateb=&owner=include&start=0&count=1000&output=atom"
    feed = feedparser.parse(rss_feed_url, request_headers={"User-Agent": user_agent})
    return [
        {"url": entry.link, "title": entry.title, "updated": entry.updated, "docs": []}
        for entry in feed.entries
    ]


def _collect_doc_links(filings, headers):
    """Visit each filing's index page and append its document hrefs in place."""
    progress_bar = tqdm(filings, desc="Processing filings")
    for filing in progress_bar:
        sleep(.5)  # throttle: SEC fair-access policy limits request rate
        try:
            # timeout so one stalled connection cannot hang the loop forever
            response = requests.get(filing["url"], headers=headers, timeout=30)
        except requests.RequestException:
            continue  # skip an unreachable filing page; best-effort scrape
        if response.status_code != 200:
            continue
        soup = BeautifulSoup(response.content, "html.parser")
        for a_tag in soup.find_all("a", href=True):
            href = a_tag["href"]
            # Keep only document links hosted under the EDGAR archive tree.
            if href.endswith((".htm", ".xml", ".pdf", ".jpg", ".png")) and '/Archives/edgar/data/' in href:
                filing["docs"].append(href)
        progress_bar.set_postfix({"Docs Found": len(filing["docs"])})


def _download_docs(filings, headers, folder):
    """Stream every collected document to *folder*, one file per link."""
    progress_bar = tqdm(filings, desc="Downloading filings")
    for item in progress_bar:
        docs = item['docs']
        # "/" in a title would otherwise be treated as a path separator.
        safe_title = item['title'].replace("/", "_")
        for doc in docs:
            url = 'https://www.sec.gov' + doc
            file_path = os.path.join(folder, safe_title + '_' + doc.split('/')[-1])
            try:
                response = requests.get(url, stream=True, headers=headers, timeout=30)
                response.raise_for_status()
            except requests.RequestException as exc:
                # One bad document must not abort the whole scraper
                # (the original raise_for_status crashed the loop here).
                fancy_message('warn', f'Skipping {url}: {exc}')
                continue
            with open(file_path, 'wb') as file:
                for data in response.iter_content(1024):  # 1 KB chunks
                    file.write(data)
        progress_bar.set_postfix({"Attachments found": len(docs)})
    progress_bar.close()


while True:
    # SEC requires a descriptive User-Agent ("Company Name contact@email").
    user_agent = "<company> <email>"
    # One shared header dict (the original built the same dict twice).
    headers = {
        "User-Agent": user_agent,
        "Accept-Encoding": "gzip, deflate",
        "Host": "www.sec.gov",
    }
    filings = _fetch_filings(user_agent)
    _collect_doc_links(filings, headers)
    folder = 'sec_filings'
    os.makedirs(folder, exist_ok=True)
    _download_docs(filings, headers, folder)
    fancy_message('info', f'Downloads completed @ {print_current_datetime()}')
    fancy_message('wait', 'sleeping for 2 minutes')
    sleep(120)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment