Created
April 16, 2015 23:59
-
-
Save bbzzzz/3fab55c42a308e41ae7d to your computer and use it in GitHub Desktop.
Web-scrape all XBRL files for a given stock ticker
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import urllib2
from contextlib import closing

from bs4 import BeautifulSoup as BeautifulSoup
def get_list(ticker, form_type="10-K", after_year=2008):
    """Collect EDGAR filing-index URLs for a company's filings.

    Pages through the SEC EDGAR company browse endpoint (XML output,
    100 filings per request, at most 2000 filings in total) and keeps
    the filings whose type equals *form_type* and whose filing year is
    later than *after_year*. Every kept URL is also printed.

    Args:
        ticker: Stock ticker inserted into the ``CIK=`` query parameter.
        form_type: Exact filing type to keep (default ``"10-K"``).
        after_year: Only filings dated in a later year are kept
            (default ``2008``).

    Returns:
        A list with the ``filinghref`` text of every matching filing,
        in the order the filings appear in the responses.

    Raises:
        Errors raised by ``urllib2.urlopen`` propagate to the caller.
    """
    base_url_part1 = "http://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK="
    base_url_part2 = "&type=&dateb=&owner=&start="
    base_url_part3 = "&count=100&output=xml"
    href = []
    for page_number in range(0, 2000, 100):
        base_url = base_url_part1 + ticker + base_url_part2 + str(page_number) + base_url_part3
        # BeautifulSoup consumes the response inside the constructor, so the
        # connection can be released as soon as the soup is built.
        with closing(urllib2.urlopen(base_url)) as sec_page:
            sec_soup = BeautifulSoup(sec_page)
        filings = sec_soup.findAll('filing')
        if not filings:
            # An empty page means the listing is exhausted; requesting the
            # remaining offsets would only produce more empty pages.
            break
        for filing in filings:
            # The date text starts with the four-digit year.
            report_year = int(filing.datefiled.get_text()[0:4])
            if filing.type.get_text() == form_type and report_year > after_year:
                filing_url = filing.filinghref.get_text()
                print(filing_url)
                href.append(filing_url)
    return href
# Collect the filing index URLs for the ticker "aapl".
url_list = get_list("aapl")
def download_report(url_list):
    """Download the XBRL instance document linked from each filing page.

    Each URL in *url_list* is fetched and its table rows are scanned. A
    row whose second cell reads ``XBRL INSTANCE DOCUMENT`` and whose
    third cell contains a link is treated as the target: the linked file
    is downloaded and written to the current working directory under the
    last path segment of its URL. Progress is printed along the way.

    Rows that do not have this shape are skipped. A failed download or
    file write is reported and the scan continues with the next row.

    Args:
        url_list: Iterable of filing index page URLs.

    Returns:
        None.

    Raises:
        Errors raised while fetching a filing index page itself
        propagate to the caller.
    """
    target_base_url = 'http://www.sec.gov'
    target_file_name = u'XBRL INSTANCE DOCUMENT'
    for report_url in url_list:
        # BeautifulSoup reads the whole response while constructing the soup,
        # so the connection can be closed right afterwards.
        with closing(urllib2.urlopen(report_url)) as report_page:
            report_soup = BeautifulSoup(report_page)
        for row in report_soup.findAll('tr'):
            cells = row.findAll('td')
            # Explicit shape checks replace the old catch-all ``except``.
            if len(cells) < 3 or cells[1].get_text() != target_file_name:
                continue
            link = cells[2].find('a')
            if link is None or not link.get('href'):
                continue
            target_url = target_base_url + link['href']
            print("Target URL found!")
            print("Target URL is: " + target_url)
            file_name = target_url.split('/')[-1]
            print(file_name)
            try:
                # Read the payload before opening the output file so a failed
                # download does not leave an empty or truncated file behind.
                with closing(urllib2.urlopen(target_url)) as xbrl_report:
                    payload = xbrl_report.read()
                with open(file_name, 'wb') as output:
                    output.write(payload)
            except IOError as err:
                # urllib2's URLError/HTTPError derive from IOError; keep the
                # run best-effort but make the failure visible.
                print("Download failed for %s: %s" % (target_url, err))
# Fetch and save the XBRL instance document for every collected filing URL.
download_report(url_list)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment