Write to a file as you web scrape sec.gov - useful when scraping large data sets
''' In this example I will be scraping sec.gov for Form 8-K filings using Beautiful Soup.
We will then store each row in a CSV file line by line. This can be useful
when scraping a large data set, preventing data from being lost when an error occurs.
'''
from bs4 import BeautifulSoup
import requests
# We will collect 100 Form 8-Ks for Amazon using their ticker symbol and CIK code.
ticker = 'AMZN'
cik = '0001018724'
# Let's look for all 8-Ks filed prior to 2018, and limit our results to 100
priorto = '20180101'
count = '100'
# Create a new file and write the header row.
# Open it with UTF-8 explicitly: filings often contain non-ASCII characters,
# which raise UnicodeEncodeError under the cp1252 default codec on Windows.
with open('data/my_file.csv', 'w', encoding='utf-8') as f:
    f.write('ticker,filing_date,8k\n')
# base_url is a webpage with a list of Amazon's 8-Ks. Turn it into soup:
base_url = 'http://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK='\
           + str(cik) + '&type=8-K&dateb='\
           + str(priorto) + '&owner=exclude&output=xml&count='\
           + str(count)
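# With the values above, the assembled URL is:
# http://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=0001018724&type=8-K&dateb=20180101&owner=exclude&output=xml&count=100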
r = requests.get(base_url)
data = r.text
soup = BeautifulSoup(data, 'lxml')
# Get the URL for each 8-K detail page using the 'filinghref' tag:
detail_urls = soup.find_all('filinghref')
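# (Each <filingHREF> element in EDGAR's XML results holds the URL of one
# filing's index page. The 'lxml' parser lowercases tag names when parsing,
# which is why the search string here is 'filinghref'.)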
# Iterate through each URL:
for url in detail_urls:
    # If the URL's suffix is 'htm', change it to 'html'
    url = url.string
    if url.split('.')[-1] == 'htm':
        url += 'l'
    # Turn each URL into soup
    r = requests.get(url)
    data = r.text
    soup = BeautifulSoup(data, 'lxml')
    # Locate the filing date on the detail page by searching by tag and class
    filing_date = soup.find_all('div', {'class': 'info'})[1].string
    # The suffix of the Form 8-K document can be found on this page too
    url_suffix = soup.find_all('tr')[1].find_all('td')[2].get_text()
    # Slice off the end of our URL, and add the new suffix to get the Form 8-K URL
    doc_url = url[0:-31] + url_suffix
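    # (The 31 characters sliced off are assumed to be the index filename,
    # e.g. '0001018724-17-000123-index.html': a 20-character accession
    # number plus '-index.html'. The accession number shown is made up.)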
    # Turn the Form 8-K URL into soup
    r = requests.get(doc_url)
    data = r.text
    soup = BeautifulSoup(data, 'lxml')
    # Finally, the text we're after!
    # Get rid of the commas, quotation marks, and any newline characters
    text = soup.get_text().replace(',', '').replace('"', '').replace('\n', ' ')
    # Combine the data into a CSV-style string and write it to the file
    # created before the for loop started.
    write_string = ticker + ',' + filing_date + ',' + text + '\n'
    # 'a' opens the file in append mode; use UTF-8 to match the header write
    with open('data/my_file.csv', 'a', encoding='utf-8') as f:
        f.write(write_string)
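Since the point of writing line by line is that a crash costs you at most one row, a natural extension is to guard each iteration so a single bad filing is skipped rather than aborting the whole run. Below is a minimal sketch of that pattern; process_filing is a hypothetical helper standing in for the per-URL steps above, returning one finished CSV row.

def process_filing(detail_url):
    # Hypothetical stand-in for the detail-page and document steps above;
    # it should return one finished 'ticker,filing_date,text\n' row.
    raise NotImplementedError

for url in detail_urls:
    try:
        write_string = process_filing(url.string)
    except Exception as e:
        # Log and move on; rows already written are safe on disk.
        print('Skipping ' + url.string + ': ' + str(e))
        continue
    with open('data/my_file.csv', 'a', encoding='utf-8') as f:
        f.write(write_string)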
Hello
How are you?
Thanks for the script. When I ran it, I got this error:
line 19, in encode
return codecs.charmap_encode(input,self.errors,encoding_table)[0]
UnicodeEncodeError: 'charmap' codec can't encode character '\x92' in position 696: character maps to <undefined>