## tommycarstensen/parseSEC.py

## parseSEC.py
#!/bin/env python3

#Tommy Carstensen, April 2018

import feedparser
import requests
import time
import os
import re
import smtplib
from email.message import EmailMessage
import base64

## EDGAR "latest filings" Atom feed, 100 entries per request. The
## owner=exclude variant is the one in use; the same URL with owner=include
## is the alternative (presumably it also lists ownership filings -- confirm
## against EDGAR before switching).
url = 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcurrent&CIK=&type=&company=&dateb=&owner=exclude&start=0&count=100&output=atom'

## Plain-text "database": one already processed entry id per line.
db = 'sec.db'


def init_db(path, max_lines=500):
    """Make sure the id file exists and empty it once it has grown too long.

    Parameters
    ----------
    path : str
        Path of the text file holding one processed entry id per line.
    max_lines : int
        The file is emptied when it holds more than this many lines.

    The file is truncated rather than removed: `check_db` opens it for
    reading unconditionally, so a missing file would raise
    FileNotFoundError on the first entry that is checked.
    """
    if os.path.isfile(path):
        with open(path) as f:
            if sum(1 for _ in f) <= max_lines:
                return
    ## Create the file, or reset it to empty.
    with open(path, 'w'):
        pass


init_db(db)

## Captures the path of every link into /Archives/edgar/data/ found in the
## HTML of a filing index page (href value quoted or unquoted).
pattern = r'href=[\'"]?(/Archives/edgar/data/[^\'" >]+)'


def send_message(msgSubject, msgContent, msgTo='tommy.carstensen@gmail.com'):
    """Send a plain-text email through the SMTP server send.one.com.

    Parameters
    ----------
    msgSubject : str
        Subject header of the message.
    msgContent : str
        Plain-text body of the message.
    msgTo : str
        Recipient address.

    Raises
    ------
    OSError
        If the file `.password` cannot be read or the connection fails.
    smtplib.SMTPException
        If the TLS negotiation, the login or the delivery fails.
    """
    sender = 'noreply@tommycarstensen.com'

    ## The SMTP password is kept base64 encoded in the file `.password`.
    with open('.password') as f:
        password = base64.b64decode(f.read()).decode()

    msg = EmailMessage()
    msg['Subject'] = msgSubject
    msg['From'] = sender
    msg['To'] = msgTo
    msg.set_content(msgContent)

    ## The context manager issues QUIT and closes the socket even when
    ## starttls/login/send_message raises, so no connection is leaked.
    with smtplib.SMTP(host='send.one.com', port=587) as server:
        server.starttls()
        server.login(sender, password)
        server.send_message(msg)


def main():
    """Poll the feed at `url` forever and report the running download total.

    Every round parses the feed, hands its entries to `loop_entries` and
    then sleeps for the number of seconds that function asks for (no sleep
    when it asks for 0). The loop has no exit condition.
    """
    downloaded_total = 0
    while True:
        feed = feedparser.parse(url)
        downloaded, pause = loop_entries(feed.entries)
        downloaded_total += downloaded
        print('Total MB downloaded:', downloaded_total)
        if pause > 0:
            print('sleep', pause)
            time.sleep(pause)


def loop_entries(entries):
    """Process the feed entries that are new and not of a skipped form type.

    For each such entry the page at `entry.link` is downloaded, the links
    matching the module-level `pattern` are handed to `loop_documents`, and
    afterwards the entry id is appended to the `db` file and the form type
    to the file `forms`.

    Parameters
    ----------
    entries : iterable
        Feed entries exposing `title`, `tags`, `id` and `link`.

    Returns
    -------
    tuple
        `(Mbytes, sleep)`: the megabytes downloaded during this call and
        the number of seconds the caller should wait before polling again
        (600, 300, 60 or 0, decreasing with the number of new entries).

    Raises
    ------
    SystemExit
        If the form type in a title disagrees with the entry's first tag.
    RuntimeError
        If more than 90 new entries were processed in one call.
    """
    Mbytes = 0
    nEntries = 0
    for entry in entries:

        ## The form type is the part of the title before the first ' - '
        ## and has to agree with the term of the first tag.
        form = entry.title.split(' - ')[0]
        try:
            term = entry.tags[0]['term']
        except (AttributeError, IndexError, KeyError, TypeError):
            term = None
        if form != term:
            ## Explicit check instead of assert (stripped under -O) inside
            ## a bare except (which would also swallow KeyboardInterrupt).
            print(getattr(entry, 'tags', None), entry.title)
            raise SystemExit(1)
        if skip_form(form):
            continue

        ## Skip if entry was already looped over.
        if check_db(entry.id):
            continue
        nEntries += 1

        print(entry.title)
        print(entry.link)
        response = requests.get(entry.link)
        ## Count the bytes received, not the characters after decoding.
        Mbytes += len(response.content) / (1024 ** 2)
        Mbytes += loop_documents(
            entry,
            re.findall(pattern, response.text),
            form,
            )

        ## Append to DB *after* reading all urls/documents of the entry/filing.
        with open(db, 'a') as f:
            print(entry.id, file=f)
        with open('forms', 'a') as f:
            print(form, file=f)

    if nEntries > 90:
        ## The original halted here through an undefined name (NameError);
        ## the halt is kept but made explicit. Presumably the concern is
        ## that filings were missed between two polls -- TODO confirm.
        raise RuntimeError(
            '{} new entries in a single poll of the feed'.format(nEntries))
    print('Mbytes downloaded:', Mbytes)
    ## Sleep if few entries and otherwise continue while loop.
    if nEntries == 0:
        sleep = 600
    elif nEntries < 5:
        sleep = 300
    elif nEntries < 20:
        sleep = 60
    else:
        sleep = 0

    return Mbytes, sleep


def loop_documents(entry, documents, form):
    """Download the documents of one filing and alert on the first match.

    Parameters
    ----------
    entry : object
        Feed entry; its `title` and `link` are used in the alert.
    documents : iterable of str
        Site-relative hrefs; each is appended to 'https://www.sec.gov'.
    form : str
        Form type of the filing. For '13F-HR/A' only '.xml' documents
        are read.

    Returns
    -------
    int or float
        Megabytes downloaded by this call (0 when nothing was fetched).

    Raises
    ------
    SystemExit
        If a document has an extension that is neither skipped nor known.

    Reading stops at the first document in which `loop_lines` finds a
    match; that document is emailed through `send_message` and printed.
    """
    skipped_exts = ('.pdf', '.jpg', '.gif', '.paper', '.fil')
    known_exts = ('.txt', '.xml', '.htm', '.xsd')

    Mbytes = 0
    for href in documents:
        document_url = 'https://www.sec.gov' + href
        ext = os.path.splitext(href)[1].lower()
        if ext in skipped_exts:
            continue
        if ext not in known_exts:
            print(entry.link)
            print(document_url)
            print('unknown extension')
            raise SystemExit(1)
        ## Plain comparison; the original `in ('.xml')` was a substring
        ## test against a string, not membership in a tuple.
        if form == '13F-HR/A' and ext != '.xml':
            continue
        response = requests.get(document_url)
        ## Count the bytes received, not the characters after decoding.
        Mbytes += len(response.content) / (1024 ** 2)
        Continue, line = loop_lines(response)
        if Continue:
            continue
        send_message(
            entry.title,
            '\n\n'.join((
                entry.title,
                entry.link,
                document_url,
                line,
                response.text,
                ))
            )
        print(
            response.text,
            entry.title,
            entry.link,
            document_url,
            line,
            sep='\n\n',
            )

        ## Do not read additional documents associated with entry.
        return Mbytes

    return Mbytes


def loop_lines(response, keywords=('TEVA PHARM', '23ANDME', 'AKORN')):
    """Look for the first line of `response.text` containing a keyword.

    Parameters
    ----------
    response : object
        Anything with a `text` attribute holding the document as a string.
    keywords : iterable of str
        Substrings to search for; the comparison ignores case.

    Returns
    -------
    tuple
        `(False, line)` with the first matching line, or `(True, None)`
        when no line matches. The first element therefore tells the
        caller whether to continue with the next document.
    """
    needles = tuple(keyword.upper() for keyword in keywords)
    for line in response.text.split('\n'):
        ## Upper-case each line once instead of once per keyword.
        upper = line.upper()
        if any(needle in upper for needle in needles):
            return False, line
    return True, None


def skip_form(form):
    """Tell whether filings of the form type `form` are to be ignored.

    Returns True when `form` is one of the exact types listed below or
    starts with one of the listed prefixes, otherwise False.
    """
    ## https://www.sec.gov/forms
    ## https://www.sec.gov/info/edgar/forms/edgform.pdf
    ## https://en.wikipedia.org/wiki/SEC_filing#All_filing_types

    skipped_forms = (
        'N-Q',  # Quarterly schedule of portfolio holdings (investment companies)
        'NSAR-A',  # Semi-annual report of registered investment companies
        'NSAR-A/A',  # Amendment to NSAR-A
        'NSAR-B',  # Annual report of registered investment companies
        'DEF 14A',  # Definitive proxy statement
        '10-K',  # Annual report (not current information)
        '10-Q',  # Quarterly report (not current information)
        '6-K',  # Report of foreign private issuer
        )
    skipped_prefixes = (
        'ADV',  # Filed by investment advisers
        'MA',  # Filed by municipal advisors
        'ABS',  # Asset backed securities
        '424',  # Prospectus
        '497K',  # Investment companies
        )

    return form in skipped_forms or form.startswith(skipped_prefixes)


def check_db(ID):
    """Return True if `ID` is one of the lines stored in the `db` file.

    Trailing whitespace (including the newline) of every stored line is
    ignored in the comparison. The file is read from the start on each
    call and must exist.
    """
    with open(db) as f:
        return any(stored.rstrip() == ID for stored in f)


## Start the polling loop only when the file is executed as a script,
## not when it is imported as a module.
if __name__ == '__main__':
    main()
	#!/bin/env python3

	#Tommy Carstensen, April 2018

	import feedparser
	import requests
	import time
	import os
	import re
	import smtplib
	from email.message import EmailMessage
	import base64

	## EDGAR "latest filings" Atom feed, 100 entries per request. The
	## owner=exclude variant is the one in use; the same URL with owner=include
	## is the alternative (presumably it also lists ownership filings -- confirm
	## against EDGAR before switching).
	url = 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcurrent&CIK=&type=&company=&dateb=&owner=exclude&start=0&count=100&output=atom'

	## Plain-text "database": one already processed entry id per line.
	db = 'sec.db'


	def init_db(path, max_lines=500):
	    """Make sure the id file exists and empty it once it has grown too long.

	    Parameters
	    ----------
	    path : str
	        Path of the text file holding one processed entry id per line.
	    max_lines : int
	        The file is emptied when it holds more than this many lines.

	    The file is truncated rather than removed: `check_db` opens it for
	    reading unconditionally, so a missing file would raise
	    FileNotFoundError on the first entry that is checked.
	    """
	    if os.path.isfile(path):
	        with open(path) as f:
	            if sum(1 for _ in f) <= max_lines:
	                return
	    ## Create the file, or reset it to empty.
	    with open(path, 'w'):
	        pass


	init_db(db)

	## Captures the path of every link into /Archives/edgar/data/ found in the
	## HTML of a filing index page (href value quoted or unquoted).
	pattern = r'href=[\'"]?(/Archives/edgar/data/[^\'" >]+)'


	def send_message(msgSubject, msgContent, msgTo='tommy.carstensen@gmail.com'):
	    """Send a plain-text email through the SMTP server send.one.com.

	    Parameters
	    ----------
	    msgSubject : str
	        Subject header of the message.
	    msgContent : str
	        Plain-text body of the message.
	    msgTo : str
	        Recipient address.

	    Raises
	    ------
	    OSError
	        If the file `.password` cannot be read or the connection fails.
	    smtplib.SMTPException
	        If the TLS negotiation, the login or the delivery fails.
	    """
	    sender = 'noreply@tommycarstensen.com'

	    ## The SMTP password is kept base64 encoded in the file `.password`.
	    with open('.password') as f:
	        password = base64.b64decode(f.read()).decode()

	    msg = EmailMessage()
	    msg['Subject'] = msgSubject
	    msg['From'] = sender
	    msg['To'] = msgTo
	    msg.set_content(msgContent)

	    ## The context manager issues QUIT and closes the socket even when
	    ## starttls/login/send_message raises, so no connection is leaked.
	    with smtplib.SMTP(host='send.one.com', port=587) as server:
	        server.starttls()
	        server.login(sender, password)
	        server.send_message(msg)


	def main():
	    """Poll the feed at `url` forever and report the running download total.

	    Every round parses the feed, hands its entries to `loop_entries` and
	    then sleeps for the number of seconds that function asks for (no sleep
	    when it asks for 0). The loop has no exit condition.
	    """
	    downloaded_total = 0
	    while True:
	        feed = feedparser.parse(url)
	        downloaded, pause = loop_entries(feed.entries)
	        downloaded_total += downloaded
	        print('Total MB downloaded:', downloaded_total)
	        if pause > 0:
	            print('sleep', pause)
	            time.sleep(pause)


	def loop_entries(entries):
	    """Process the feed entries that are new and not of a skipped form type.

	    For each such entry the page at `entry.link` is downloaded, the links
	    matching the module-level `pattern` are handed to `loop_documents`, and
	    afterwards the entry id is appended to the `db` file and the form type
	    to the file `forms`.

	    Parameters
	    ----------
	    entries : iterable
	        Feed entries exposing `title`, `tags`, `id` and `link`.

	    Returns
	    -------
	    tuple
	        `(Mbytes, sleep)`: the megabytes downloaded during this call and
	        the number of seconds the caller should wait before polling again
	        (600, 300, 60 or 0, decreasing with the number of new entries).

	    Raises
	    ------
	    SystemExit
	        If the form type in a title disagrees with the entry's first tag.
	    RuntimeError
	        If more than 90 new entries were processed in one call.
	    """
	    Mbytes = 0
	    nEntries = 0
	    for entry in entries:

	        ## The form type is the part of the title before the first ' - '
	        ## and has to agree with the term of the first tag.
	        form = entry.title.split(' - ')[0]
	        try:
	            term = entry.tags[0]['term']
	        except (AttributeError, IndexError, KeyError, TypeError):
	            term = None
	        if form != term:
	            ## Explicit check instead of assert (stripped under -O) inside
	            ## a bare except (which would also swallow KeyboardInterrupt).
	            print(getattr(entry, 'tags', None), entry.title)
	            raise SystemExit(1)
	        if skip_form(form):
	            continue

	        ## Skip if entry was already looped over.
	        if check_db(entry.id):
	            continue
	        nEntries += 1

	        print(entry.title)
	        print(entry.link)
	        response = requests.get(entry.link)
	        ## Count the bytes received, not the characters after decoding.
	        Mbytes += len(response.content) / (1024 ** 2)
	        Mbytes += loop_documents(
	            entry,
	            re.findall(pattern, response.text),
	            form,
	            )

	        ## Append to DB after reading all urls/documents of the entry/filing.
	        with open(db, 'a') as f:
	            print(entry.id, file=f)
	        with open('forms', 'a') as f:
	            print(form, file=f)

	    if nEntries > 90:
	        ## The original halted here through an undefined name (NameError);
	        ## the halt is kept but made explicit. Presumably the concern is
	        ## that filings were missed between two polls -- TODO confirm.
	        raise RuntimeError(
	            '{} new entries in a single poll of the feed'.format(nEntries))
	    print('Mbytes downloaded:', Mbytes)
	    ## Sleep if few entries and otherwise continue while loop.
	    if nEntries == 0:
	        sleep = 600
	    elif nEntries < 5:
	        sleep = 300
	    elif nEntries < 20:
	        sleep = 60
	    else:
	        sleep = 0

	    return Mbytes, sleep


	def loop_documents(entry, documents, form):
	    """Download the documents of one filing and alert on the first match.

	    Parameters
	    ----------
	    entry : object
	        Feed entry; its `title` and `link` are used in the alert.
	    documents : iterable of str
	        Site-relative hrefs; each is appended to 'https://www.sec.gov'.
	    form : str
	        Form type of the filing. For '13F-HR/A' only '.xml' documents
	        are read.

	    Returns
	    -------
	    int or float
	        Megabytes downloaded by this call (0 when nothing was fetched).

	    Raises
	    ------
	    SystemExit
	        If a document has an extension that is neither skipped nor known.

	    Reading stops at the first document in which `loop_lines` finds a
	    match; that document is emailed through `send_message` and printed.
	    """
	    skipped_exts = ('.pdf', '.jpg', '.gif', '.paper', '.fil')
	    known_exts = ('.txt', '.xml', '.htm', '.xsd')

	    Mbytes = 0
	    for href in documents:
	        document_url = 'https://www.sec.gov' + href
	        ext = os.path.splitext(href)[1].lower()
	        if ext in skipped_exts:
	            continue
	        if ext not in known_exts:
	            print(entry.link)
	            print(document_url)
	            print('unknown extension')
	            raise SystemExit(1)
	        ## Plain comparison; the original `in ('.xml')` was a substring
	        ## test against a string, not membership in a tuple.
	        if form == '13F-HR/A' and ext != '.xml':
	            continue
	        response = requests.get(document_url)
	        ## Count the bytes received, not the characters after decoding.
	        Mbytes += len(response.content) / (1024 ** 2)
	        Continue, line = loop_lines(response)
	        if Continue:
	            continue
	        send_message(
	            entry.title,
	            '\n\n'.join((
	                entry.title,
	                entry.link,
	                document_url,
	                line,
	                response.text,
	                ))
	            )
	        print(
	            response.text,
	            entry.title,
	            entry.link,
	            document_url,
	            line,
	            sep='\n\n',
	            )

	        ## Do not read additional documents associated with entry.
	        return Mbytes

	    return Mbytes


	def loop_lines(response, keywords=('TEVA PHARM', '23ANDME', 'AKORN')):
	    """Look for the first line of `response.text` containing a keyword.

	    Parameters
	    ----------
	    response : object
	        Anything with a `text` attribute holding the document as a string.
	    keywords : iterable of str
	        Substrings to search for; the comparison ignores case.

	    Returns
	    -------
	    tuple
	        `(False, line)` with the first matching line, or `(True, None)`
	        when no line matches. The first element therefore tells the
	        caller whether to continue with the next document.
	    """
	    needles = tuple(keyword.upper() for keyword in keywords)
	    for line in response.text.split('\n'):
	        ## Upper-case each line once instead of once per keyword.
	        upper = line.upper()
	        if any(needle in upper for needle in needles):
	            return False, line
	    return True, None


	def skip_form(form):
	    """Tell whether filings of the form type `form` are to be ignored.

	    Returns True when `form` is one of the exact types listed below or
	    starts with one of the listed prefixes, otherwise False.
	    """
	    ## https://www.sec.gov/forms
	    ## https://www.sec.gov/info/edgar/forms/edgform.pdf
	    ## https://en.wikipedia.org/wiki/SEC_filing#All_filing_types

	    skipped_forms = (
	        'N-Q',  # Quarterly schedule of portfolio holdings (investment companies)
	        'NSAR-A',  # Semi-annual report of registered investment companies
	        'NSAR-A/A',  # Amendment to NSAR-A
	        'NSAR-B',  # Annual report of registered investment companies
	        'DEF 14A',  # Definitive proxy statement
	        '10-K',  # Annual report (not current information)
	        '10-Q',  # Quarterly report (not current information)
	        '6-K',  # Report of foreign private issuer
	        )
	    skipped_prefixes = (
	        'ADV',  # Filed by investment advisers
	        'MA',  # Filed by municipal advisors
	        'ABS',  # Asset backed securities
	        '424',  # Prospectus
	        '497K',  # Investment companies
	        )

	    return form in skipped_forms or form.startswith(skipped_prefixes)


	def check_db(ID):
	    """Return True if `ID` is one of the lines stored in the `db` file.

	    Trailing whitespace (including the newline) of every stored line is
	    ignored in the comparison. The file is read from the start on each
	    call and must exist.
	    """
	    with open(db) as f:
	        return any(stored.rstrip() == ID for stored in f)


	## Start the polling loop only when the file is executed as a script,
	## not when it is imported as a module.
	if __name__ == '__main__':
	    main()