# bbzzzz/download_report

## download_report
import urllib2
from bs4 import BeautifulSoup as BeautifulSoup

def get_list(ticker, form_type="10-K", after_year=2008):
    """Collect the filing-index URLs of a company's EDGAR filings.

    Queries the EDGAR ``browse-edgar`` endpoint in pages of 100 results
    (start offsets 0, 100, ... 1900), keeps every filing whose type equals
    ``form_type`` and whose filing year is strictly greater than
    ``after_year``, prints each matching URL and returns them all.

    Args:
        ticker: Company identifier inserted into the ``CIK=`` query
            parameter (e.g. ``"aapl"``).
        form_type: Exact filing type to keep. Defaults to ``"10-K"``.
        after_year: Only filings dated in a later year are kept.
            Defaults to ``2008``.

    Returns:
        A list with the ``filinghref`` text of each matching filing, in the
        order the filings were returned by EDGAR.

    Raises:
        urllib2.URLError: If a listing page cannot be fetched.
    """
    base_url_part1 = "http://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK="
    base_url_part2 = "&type=&dateb=&owner=&start="
    base_url_part3 = "&count=100&output=xml"
    href = []

    for page_number in range(0, 2000, 100):
        base_url = base_url_part1 + ticker + base_url_part2 + str(page_number) + base_url_part3

        sec_page = urllib2.urlopen(base_url)
        try:
            # NOTE(review): the tags below are looked up in lowercase, which
            # presumably relies on an HTML-mode parser lowercasing EDGAR's
            # element names -- confirm before passing an explicit parser.
            sec_soup = BeautifulSoup(sec_page)
        finally:
            # Release the connection even if parsing fails.
            sec_page.close()

        filings = sec_soup.findAll('filing')
        if not filings:
            # An empty page means the listing is exhausted; requesting the
            # remaining offsets would only produce more empty pages.
            break

        for filing in filings:
            report_year = int(filing.datefiled.get_text()[0:4])
            # Logical `and` (not bitwise `&`) so the year test is skipped
            # for filings of the wrong type.
            if filing.type.get_text() == form_type and report_year > after_year:
                link = filing.filinghref.get_text()
                print(link)
                href.append(link)

    return href

# Collect the matching filing URLs for ticker "aapl". This runs at module
# load, so merely importing this file issues the network requests made by
# get_list.
url_list= get_list("aapl")

def download_report(url_list):
    """Download the XBRL instance document linked from each filing page.

    For every URL in ``url_list`` the page is fetched and its table rows are
    scanned. A row whose second cell reads ``XBRL INSTANCE DOCUMENT`` and
    whose third cell contains a link is treated as the target: the linked
    file is downloaded and written to the current working directory under
    the last path component of its URL. Progress is printed to stdout.

    Rows that do not have the expected shape are skipped. A failed download
    or write is reported and skipped so the remaining documents are still
    processed.

    Args:
        url_list: Iterable of filing-index page URLs.

    Raises:
        urllib2.URLError: If a filing-index page itself cannot be fetched.
    """
    target_base_url = 'http://www.sec.gov'
    target_file_name = u'XBRL INSTANCE DOCUMENT'

    for report_url in url_list:
        report_page = urllib2.urlopen(report_url)
        try:
            report_soup = BeautifulSoup(report_page)
        finally:
            # Release the connection even if parsing fails.
            report_page.close()

        for item in report_soup.findAll('tr'):
            cells = item.findAll('td')
            # Rows with fewer than three cells (e.g. header rows) cannot
            # hold the description/link pair we are looking for.
            if len(cells) < 3 or cells[1].get_text() != target_file_name:
                continue

            link = cells[2].find('a')
            if link is None or not link.get('href'):
                continue

            target_url = target_base_url + link['href']
            print("Target URL found!")
            print("Target URL is: " + target_url)

            file_name = target_url.split('/')[-1]
            print(file_name)

            try:
                xbrl_report = urllib2.urlopen(target_url)
                try:
                    # Read the whole body before opening the output file so
                    # a failed download does not leave an empty file behind.
                    data = xbrl_report.read()
                finally:
                    xbrl_report.close()

                with open(file_name, 'wb') as output:
                    output.write(data)
            except IOError as err:
                # IOError covers urllib2.URLError/HTTPError as well as local
                # write failures; report it instead of hiding it, then move
                # on to the next document.
                print("Could not download %s: %s" % (target_url, err))

# Fetch and save the XBRL instance document for every URL in url_list.
# Like the assignment of url_list, this executes at module load.
download_report(url_list)
	import urllib2
	from bs4 import BeautifulSoup as BeautifulSoup

	def get_list(ticker, form_type="10-K", after_year=2008):
		"""Collect the filing-index URLs of a company's EDGAR filings.

		Queries the EDGAR ``browse-edgar`` endpoint in pages of 100 results
		(start offsets 0, 100, ... 1900), keeps every filing whose type equals
		``form_type`` and whose filing year is strictly greater than
		``after_year``, prints each matching URL and returns them all.

		Args:
			ticker: Company identifier inserted into the ``CIK=`` query
				parameter (e.g. ``"aapl"``).
			form_type: Exact filing type to keep. Defaults to ``"10-K"``.
			after_year: Only filings dated in a later year are kept.
				Defaults to ``2008``.

		Returns:
			A list with the ``filinghref`` text of each matching filing, in
			the order the filings were returned by EDGAR.

		Raises:
			urllib2.URLError: If a listing page cannot be fetched.
		"""
		base_url_part1 = "http://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK="
		base_url_part2 = "&type=&dateb=&owner=&start="
		base_url_part3 = "&count=100&output=xml"
		href = []

		for page_number in range(0, 2000, 100):
			base_url = base_url_part1 + ticker + base_url_part2 + str(page_number) + base_url_part3

			sec_page = urllib2.urlopen(base_url)
			try:
				# NOTE(review): the tags below are looked up in lowercase,
				# which presumably relies on an HTML-mode parser lowercasing
				# EDGAR's element names -- confirm before passing an
				# explicit parser.
				sec_soup = BeautifulSoup(sec_page)
			finally:
				# Release the connection even if parsing fails.
				sec_page.close()

			filings = sec_soup.findAll('filing')
			if not filings:
				# An empty page means the listing is exhausted; requesting
				# the remaining offsets would only produce more empty pages.
				break

			for filing in filings:
				report_year = int(filing.datefiled.get_text()[0:4])
				# Logical `and` (not bitwise `&`) so the year test is
				# skipped for filings of the wrong type.
				if filing.type.get_text() == form_type and report_year > after_year:
					link = filing.filinghref.get_text()
					print(link)
					href.append(link)

		return href

	# Collect the matching filing URLs for ticker "aapl"; executed as a
	# plain statement, so it performs get_list's network requests as soon as
	# this line runs.
	url_list= get_list("aapl")

	def download_report(url_list):
		"""Download the XBRL instance document linked from each filing page.

		For every URL in ``url_list`` the page is fetched and its table rows
		are scanned. A row whose second cell reads ``XBRL INSTANCE DOCUMENT``
		and whose third cell contains a link is treated as the target: the
		linked file is downloaded and written to the current working
		directory under the last path component of its URL. Progress is
		printed to stdout.

		Rows that do not have the expected shape are skipped. A failed
		download or write is reported and skipped so the remaining documents
		are still processed.

		Args:
			url_list: Iterable of filing-index page URLs.

		Raises:
			urllib2.URLError: If a filing-index page itself cannot be fetched.
		"""
		target_base_url = 'http://www.sec.gov'
		target_file_name = u'XBRL INSTANCE DOCUMENT'

		for report_url in url_list:
			report_page = urllib2.urlopen(report_url)
			try:
				report_soup = BeautifulSoup(report_page)
			finally:
				# Release the connection even if parsing fails.
				report_page.close()

			for item in report_soup.findAll('tr'):
				cells = item.findAll('td')
				# Rows with fewer than three cells (e.g. header rows) cannot
				# hold the description/link pair we are looking for.
				if len(cells) < 3 or cells[1].get_text() != target_file_name:
					continue

				link = cells[2].find('a')
				if link is None or not link.get('href'):
					continue

				target_url = target_base_url + link['href']
				print("Target URL found!")
				print("Target URL is: " + target_url)

				file_name = target_url.split('/')[-1]
				print(file_name)

				try:
					xbrl_report = urllib2.urlopen(target_url)
					try:
						# Read the whole body before opening the output file
						# so a failed download does not leave an empty file.
						data = xbrl_report.read()
					finally:
						xbrl_report.close()

					with open(file_name, 'wb') as output:
						output.write(data)
				except IOError as err:
					# IOError covers urllib2.URLError/HTTPError as well as
					# local write failures; report it instead of hiding it,
					# then move on to the next document.
					print("Could not download %s: %s" % (target_url, err))

	# Fetch and save the XBRL instance document for every URL in url_list.
	download_report(url_list)