Skip to content

Instantly share code, notes, and snippets.

@bbzzzz
Created April 16, 2015 23:59
Show Gist options
  • Save bbzzzz/3fab55c42a308e41ae7d to your computer and use it in GitHub Desktop.
Save bbzzzz/3fab55c42a308e41ae7d to your computer and use it in GitHub Desktop.
Web-scrape the XBRL instance documents of a company's 10-K filings (filed after 2008) from SEC EDGAR, given its stock ticker
import urllib2
from bs4 import BeautifulSoup as BeautifulSoup
def get_list(ticker):
    """Return the filing URLs of a company's 10-K filings listed on SEC EDGAR.

    Pages through EDGAR's XML company-browse listing 100 entries at a time
    (start offsets 0 to 1900) and keeps every filing whose type text is
    exactly "10-K" and whose filing date starts with a year greater than
    2008. Each matching URL is printed as it is found.

    Args:
        ticker: String placed verbatim into the ``CIK`` query parameter.

    Returns:
        A list with the text of each matching ``filinghref`` element, in
        listing order.

    Raises:
        ValueError: If a filing's date text does not start with four digits.
    """
    from contextlib import closing

    base_url_part1 = "http://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK="
    base_url_part2 = "&type=&dateb=&owner=&start="
    base_url_part3 = "&count=100&output=xml"
    href = []
    for page_number in range(0, 2000, 100):
        base_url = base_url_part1 + ticker + base_url_part2 + str(page_number) + base_url_part3
        # Close the HTTP response as soon as the page has been parsed.
        # NOTE(review): no parser is named, so bs4 picks whichever one is
        # installed; the lowercase tag lookups below presumably rely on an
        # HTML-style parser -- confirm before pinning one.
        with closing(urllib2.urlopen(base_url)) as sec_page:
            sec_soup = BeautifulSoup(sec_page)
        filings = sec_soup.findAll('filing')
        if not filings:
            # An empty page means the listing is exhausted; requesting the
            # remaining offsets would only produce more empty pages.
            break
        for filing in filings:
            report_year = int(filing.datefiled.get_text()[0:4])
            if filing.type.get_text() == "10-K" and report_year > 2008:
                filing_url = filing.filinghref.get_text()
                print(filing_url)
                href.append(filing_url)
    return href
# Collect the 10-K filing URLs for ticker "aapl"; this runs (and hits the
# network via get_list) as soon as the module is executed.
url_list= get_list("aapl")
def download_report(url_list):
    """Download the XBRL instance document linked from each filing page.

    Every URL in ``url_list`` is fetched and its table rows are scanned.
    When a row's second cell reads exactly "XBRL INSTANCE DOCUMENT", the
    link in its third cell is downloaded and written to a file named after
    the last path component of the link, relative to the current working
    directory. Progress is printed to stdout.

    Rows with fewer than three cells, or without a usable link, are
    skipped. A download that fails is reported on stderr and skipped so
    that the remaining filings are still processed.

    Args:
        url_list: Iterable of filing page URLs, e.g. the result of
            ``get_list``.

    Returns:
        None.
    """
    import sys
    from contextlib import closing

    target_base_url = 'http://www.sec.gov'
    target_file_name = u'XBRL INSTANCE DOCUMENT'
    for report_url in url_list:
        with closing(urllib2.urlopen(report_url)) as report_page:
            report_soup = BeautifulSoup(report_page)
        for item in report_soup.findAll('tr'):
            cells = item.findAll('td')
            if len(cells) < 3 or cells[1].get_text() != target_file_name:
                continue
            link = cells[2].find('a')
            link_href = link.get('href') if link is not None else None
            if not link_href:
                # No anchor, or an anchor without a target: nothing to fetch.
                continue
            target_url = target_base_url + link_href
            print("Target URL found!")
            print("Target URL is: %s" % target_url)
            file_name = target_url.split('/')[-1]
            print(file_name)
            try:
                # Read the whole body before opening the output file so a
                # failed transfer does not leave an empty file behind.
                with closing(urllib2.urlopen(target_url)) as xbrl_report:
                    payload = xbrl_report.read()
                with open(file_name, 'wb') as output:
                    output.write(payload)
            except Exception as err:
                # Best-effort batch: one bad download must not abort the
                # rest, but the failure is reported instead of being hidden.
                sys.stderr.write(
                    "Download failed for %s: %s\n" % (target_url, err))
# Fetch each filing page collected in url_list and save its XBRL instance
# document to disk.
download_report(url_list)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment