Skip to content

Instantly share code, notes, and snippets.

@mxchinegod
Created June 18, 2023 13:41
Show Gist options
  • Save mxchinegod/6489446b65d5961652cae755b72f0223 to your computer and use it in GitHub Desktop.
Disgusting SEC Filing ripper
import requests
from tqdm import tqdm
import feedparser
from bs4 import BeautifulSoup
from time import sleep
import os
from pprint import pprint
#!python3 -m pip install feedparser
def fancy_message(tag, body):
    """Print *body* as a colored, emoji-prefixed console message.

    Parameters
    ----------
    tag : str
        Severity label, case-insensitive: "FATAL", "WARN", "INFO" or "WAIT".
    body : str
        The message text to display.

    An unrecognized tag falls back to a plain ``Unknown tag: ...`` line.
    Returns None; this function only prints.
    """
    # Dict lookup replaces the original linear scan over a list of tuples.
    styles = {
        "FATAL": ("☠️", "\033[91m"),  # red
        "WARN": ("🚨", "\033[93m"),   # yellow
        "INFO": ("ℹ️", "\033[94m"),   # blue
        "WAIT": ("☕️", "\033[96m"),   # cyan
    }
    key = tag.upper()
    if key in styles:
        emoji, color_code = styles[key]
        # \033[0m resets the terminal color after the message.
        print(f'{color_code}{emoji} {key}: {body}\033[0m')
    else:
        print(f'Unknown tag: {tag}')
from datetime import datetime
def print_current_datetime():
    """Return the current local date and time as a human-readable string.

    Format: "<Month> <day>, <year> <hh>:<mm> <AM/PM>",
    e.g. "June 18, 2023 01:41 PM".

    NOTE(review): despite the name, this RETURNS the string rather than
    printing it — kept as-is so existing callers keep working.
    """
    # Single combined format string instead of two strftime calls joined later.
    return datetime.now().strftime("%B %d, %Y %I:%M %p")
# ---------------------------------------------------------------------------
# Main scrape loop: poll the SEC EDGAR "current events" Atom feed, collect
# document links from each filing's index page, download them to disk, then
# sleep two minutes and repeat.
# ---------------------------------------------------------------------------

def _fetch_filings(user_agent):
    """Parse the EDGAR current-events Atom feed into a list of filing dicts.

    Each dict holds the filing index-page url, title, updated timestamp,
    and an (initially empty) list of document links.
    """
    feedparser.USER_AGENT = user_agent
    rss_feed_url = "https://www.sec.gov/cgi-bin/browse-edgar?action=getcurrent&CIK=&type=&company=&dateb=&owner=include&start=0&count=1000&output=atom"
    feed = feedparser.parse(rss_feed_url, request_headers={"User-Agent": user_agent})
    return [
        {"url": entry.link, "title": entry.title, "updated": entry.updated, "docs": []}
        for entry in feed.entries
    ]


def _collect_doc_links(filings, headers):
    """Visit each filing's index page and append its document hrefs in place."""
    progress_bar = tqdm(filings, desc="Processing filings")
    for filing in progress_bar:
        sleep(.5)  # throttle: SEC fair-access policy limits request rate
        try:
            # timeout so one stalled connection cannot hang the loop forever
            response = requests.get(filing["url"], headers=headers, timeout=30)
        except requests.RequestException:
            continue  # skip an unreachable filing page; best-effort scrape
        if response.status_code != 200:
            continue
        soup = BeautifulSoup(response.content, "html.parser")
        for a_tag in soup.find_all("a", href=True):
            href = a_tag["href"]
            # Keep only document links hosted under the EDGAR archive tree.
            if href.endswith((".htm", ".xml", ".pdf", ".jpg", ".png")) and '/Archives/edgar/data/' in href:
                filing["docs"].append(href)
        progress_bar.set_postfix({"Docs Found": len(filing["docs"])})


def _download_docs(filings, headers, folder):
    """Stream every collected document to *folder*, one file per link."""
    progress_bar = tqdm(filings, desc="Downloading filings")
    for item in progress_bar:
        docs = item['docs']
        # "/" in a title would otherwise be treated as a path separator.
        safe_title = item['title'].replace("/", "_")
        for doc in docs:
            url = 'https://www.sec.gov' + doc
            file_path = os.path.join(folder, safe_title + '_' + doc.split('/')[-1])
            try:
                response = requests.get(url, stream=True, headers=headers, timeout=30)
                response.raise_for_status()
            except requests.RequestException as exc:
                # One bad document must not abort the whole scraper
                # (the original raise_for_status crashed the loop here).
                fancy_message('warn', f'Skipping {url}: {exc}')
                continue
            with open(file_path, 'wb') as file:
                for data in response.iter_content(1024):  # 1 KB chunks
                    file.write(data)
        progress_bar.set_postfix({"Attachments found": len(docs)})
    progress_bar.close()


while True:
    # SEC requires a descriptive User-Agent ("Company Name contact@email").
    user_agent = "<company> <email>"
    # One shared header dict (the original built the same dict twice).
    headers = {
        "User-Agent": user_agent,
        "Accept-Encoding": "gzip, deflate",
        "Host": "www.sec.gov",
    }
    filings = _fetch_filings(user_agent)
    _collect_doc_links(filings, headers)
    folder = 'sec_filings'
    os.makedirs(folder, exist_ok=True)
    _download_docs(filings, headers, folder)
    fancy_message('info', f'Downloads completed @ {print_current_datetime()}')
    fancy_message('wait', 'sleeping for 2 minutes')
    sleep(120)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment