Skip to content

Instantly share code, notes, and snippets.

@tommycarstensen
Created April 3, 2018 18:25
Show Gist options
  • Save tommycarstensen/f0c413548d49f355a6c671bce43dc764 to your computer and use it in GitHub Desktop.
Save tommycarstensen/f0c413548d49f355a6c671bce43dc764 to your computer and use it in GitHub Desktop.
Continuously check selected SEC forms for selected key words.
#!/usr/bin/env python3
#Tommy Carstensen, April 2018
import feedparser
import requests
import time
import os
import re
import smtplib
from email.message import EmailMessage
import base64
# Atom feed listing the 100 most recent EDGAR filings. The same URL with
# ``owner=include`` used to be assigned first and was immediately overwritten,
# so only this ``owner=exclude`` variant was ever in effect.
url = (
    'https://www.sec.gov/cgi-bin/browse-edgar'
    '?action=getcurrent&CIK=&type=&company=&dateb='
    '&owner=exclude&start=0&count=100&output=atom'
)

# Plain-text file holding one already-processed entry id per line.
db = 'sec.db'

# Keep the existing file only while it is small (at most 500 lines). Otherwise
# start from an empty file. The file is truncated rather than deleted so that
# later reads never hit a missing file, and it is closed before being
# rewritten.
keep_db = False
if os.path.isfile(db):
    with open(db) as f:
        keep_db = len(f.readlines()) <= 500
if not keep_db:
    with open(db, 'w'):
        pass

# Captures site-relative links to filing documents in an index page.
pattern = r'href=[\'"]?(/Archives/edgar/data/[^\'" >]+)'
def send_message(msgSubject, msgContent, msgTo='tommy.carstensen@gmail.com'):
    """Email a plain-text message through the send.one.com SMTP server.

    The account password is read from the file ``.password`` in the current
    working directory, where it is stored base64-encoded.

    Args:
        msgSubject: Subject line of the message.
        msgContent: Plain-text body of the message.
        msgTo: Recipient address.
    """
    sender = 'noreply@tommycarstensen.com'
    with open('.password') as f:
        password = base64.b64decode(f.read()).decode()

    msg = EmailMessage()
    msg['Subject'] = msgSubject
    msg['From'] = sender
    msg['To'] = msgTo
    msg.set_content(msgContent)

    # The context manager sends QUIT and closes the socket even when STARTTLS,
    # login or delivery raises, so a failure does not leak the connection.
    with smtplib.SMTP(host='send.one.com', port=587) as server:
        server.starttls()
        server.login(sender, password)
        server.send_message(msg)
def main():
    """Poll the feed at ``url`` forever, handing each batch to loop_entries.

    After every pass the running total of downloaded megabytes is printed.
    When loop_entries suggests a positive delay, that delay is printed and
    slept before the next poll. This function never returns.
    """
    downloaded_mb = 0
    while True:
        feed = feedparser.parse(url)
        batch_mb, pause = loop_entries(feed.entries)
        downloaded_mb = downloaded_mb + batch_mb
        print('Total MB downloaded:', downloaded_mb)
        # A non-positive delay means: poll again immediately.
        if pause <= 0:
            continue
        print('sleep', pause)
        time.sleep(pause)
def loop_entries(entries):
    """Process feed entries not seen before and suggest a polling delay.

    For each entry whose form type is not rejected by ``skip_form`` and whose
    id is not yet recorded (``check_db``), the page at ``entry.link`` is
    downloaded, the document links found in it are handed to
    ``loop_documents``, and the entry id is appended to the ``db`` file. The
    form type is also appended to the file ``forms``.

    Args:
        entries: Iterable of parsed feed entries exposing ``title``, ``tags``,
            ``id`` and ``link``.

    Returns:
        A ``(Mbytes, sleep)`` tuple: the megabytes downloaded during this pass
        and the number of seconds the caller should wait before polling again
        (600 when nothing was new, 300 below 5 new entries, 60 below 20,
        otherwise 0).

    Raises:
        SystemExit: If the form type parsed from an entry title differs from
            the ``term`` of the entry's first tag.
    """
    Mbytes = 0
    nEntries = 0
    for entry in entries:
        # The title starts with the form type, followed by ' - '.
        form = entry.title.split(' - ', 1)[0]

        # Explicit consistency check. The previous assert inside a bare
        # except disappeared under ``python -O`` and swallowed every error.
        try:
            term = entry.tags[0]['term']
        except (AttributeError, IndexError, KeyError, TypeError):
            term = None
        if form != term:
            print(getattr(entry, 'tags', None), entry.title)
            raise SystemExit(1)

        if skip_form(form):
            continue
        # Skip if entry was already looped over.
        if check_db(entry.id):
            continue

        nEntries += 1
        print(entry.title)
        print(entry.link)
        response = requests.get(entry.link)
        # Count bytes actually received, not decoded characters.
        Mbytes += len(response.content) / (1024 ** 2)
        Mbytes += loop_documents(
            entry,
            re.findall(pattern, response.text),
            form,
        )

        # Append to DB *after* reading all urls/documents of the entry/filing.
        with open(db, 'a') as f:
            print(entry.id, file=f)
        with open('forms', 'a') as f:
            print(form, file=f)

    # This used to be the undefined name ``stop``, which crashed with a
    # NameError. Report the condition and keep monitoring instead.
    if nEntries > 90:
        print(
            'Warning:', nEntries,
            'new entries in one pass; filings may have been missed'
            ' between polls.',
        )

    print('Mbytes downloaded:', Mbytes)

    # Sleep if few entries and otherwise continue while loop.
    if nEntries == 0:
        sleep = 600
    elif nEntries < 5:
        sleep = 300
    elif nEntries < 20:
        sleep = 60
    else:
        sleep = 0
    return Mbytes, sleep
def loop_documents(entry, documents, form):
    """Download an entry's documents and send an alert on the first hit.

    Documents are fetched one by one from ``https://www.sec.gov`` and scanned
    with ``loop_lines``. On the first document with a match, an email is sent
    via ``send_message``, the same information is printed, and the remaining
    documents of the entry are not read.

    Args:
        entry: Feed entry the documents belong to; its ``title`` and ``link``
            are used in the alert and in diagnostics.
        documents: Site-relative hrefs of the documents to inspect.
        form: Form type of the filing. For ``'13F-HR/A'`` only ``.xml``
            documents are downloaded.

    Returns:
        Megabytes downloaded while scanning the documents.

    Raises:
        SystemExit: If a document has a file extension that is neither in the
            skipped nor in the scanned set.
    """
    skipped_exts = ('.pdf', '.jpg', '.gif', '.paper', '.fil')
    scanned_exts = ('.txt', '.xml', '.htm', '.xsd')

    Mbytes = 0
    for href in documents:
        ext = os.path.splitext(href)[1].lower()
        doc_url = 'https://www.sec.gov' + href

        if ext in skipped_exts:
            continue
        if ext not in scanned_exts:
            # Fail fast so that a new kind of attachment gets noticed.
            print(entry.link)
            print(doc_url)
            print('unknown extension')
            raise SystemExit(1)
        # Was ``in ('.xml')``: a substring test on a str, not tuple membership.
        if form == '13F-HR/A' and ext != '.xml':
            continue

        response = requests.get(doc_url)
        # Count bytes actually received, not decoded characters.
        Mbytes += len(response.content) / (1024 ** 2)

        Continue, line = loop_lines(response)
        if Continue:
            continue

        send_message(
            entry.title,
            '\n\n'.join((
                entry.title,
                entry.link,
                doc_url,
                line,
                response.text,
            ))
        )
        print(
            response.text,
            entry.title,
            entry.link,
            doc_url,
            line,
            sep='\n\n',
        )
        # Do not read additional documents associated with entry.
        return Mbytes

    return Mbytes
def loop_lines(response, keywords=('TEVA PHARM', '23ANDME', 'AKORN')):
    """Find the first line of a document that mentions a watched keyword.

    Args:
        response: Object whose ``text`` attribute holds the document.
        keywords: Upper-case substrings to look for. Each line is upper-cased
            before the comparison, so the document's own casing is ignored.

    Returns:
        ``(False, line)`` with the first matching line, or ``(True, None)``
        when no line matches. The first element tells the caller whether to
        continue with the next document.
    """
    for line in response.text.split('\n'):
        upper = line.upper()  # upper-case once per line, not once per keyword
        if any(keyword in upper for keyword in keywords):
            return False, line
    # Reached only after every line has been checked.
    return True, None
def skip_form(form):
    """Return True when filings of the given form type should be ignored.

    A form is ignored when it equals one of the listed form types or starts
    with one of the listed prefixes.

    Args:
        form: Form type string taken from a feed entry, e.g. ``'8-K'``.

    Returns:
        True if the form is to be skipped, otherwise False.
    """
    ## https://www.sec.gov/forms
    ## https://www.sec.gov/info/edgar/forms/edgform.pdf
    ## https://en.wikipedia.org/wiki/SEC_filing#All_filing_types
    if form in (
        'N-Q',  # Quarterly Schedule of Portfolio Holdings of Registered Management Investment Company
        'NSAR-A',  # Semi-annual report of registered investment companies
        'NSAR-A/A',
        'NSAR-B',  # Annual report of registered investment companies
        'DEF 14A',  # Definitive proxy statement
        '10-K',  # Annual reports (not current information)
        '10-Q',  # Quarterly reports (not current information)
        '6-K',  # Reports of foreign private issuers
    ):
        return True
    # str.startswith accepts a tuple, so one call covers every prefix.
    return form.startswith((
        'ADV',  # Filed By Investment Advisers
        'MA',  # Filed By Municipal Advisors
        'ABS',  # Asset Backed Securities
        '424',  # Prospectus
        '497K',  # Investment Companies
    ))
def check_db(ID, path=None):
    """Report whether an entry id is already recorded in the database file.

    Args:
        ID: Entry id to look for.
        path: File holding one id per line. Defaults to the module-level
            ``db`` path.

    Returns:
        True if a line of the file, with trailing whitespace removed, equals
        ``ID``. False otherwise, including when the file does not exist.
    """
    if path is None:
        path = db
    try:
        with open(path) as f:
            return any(line.rstrip() == ID for line in f)
    except FileNotFoundError:
        # A missing file (e.g. removed by start-up housekeeping) simply
        # means that nothing has been recorded yet.
        return False
# Start the polling loop only when the file is run as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment