Write to a file as you web scrape sec.gov - useful when scraping large data sets
''' In this example I will be scraping sec.gov for Form 8-K filings using Beautiful Soup.
We will then store each row in a CSV file line by line. This can be useful
when scraping a large data set, preventing data from being lost when an error occurs.
'''
from bs4 import BeautifulSoup
import requests
# We will collect 100 Form 8-Ks for Amazon using their ticker symbol and CIK code.
ticker = 'AMZN'
cik = '0001018724'
# Let's look for all 8-Ks filed prior to 2018, and limit our results to 100
priorto = '20180101'
count = '100'
# Create a new file and write the header row.
# Open it with UTF-8 explicitly: filings often contain non-ASCII characters,
# which raise UnicodeEncodeError under the cp1252 default codec on Windows.
with open('data/my_file.csv', 'w', encoding='utf-8') as f:
    f.write('ticker,filing_date,8k\n')
# base_url is a webpage with a list of Amazon's 8-Ks. Turn it into soup:
base_url = 'http://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK='\
           + str(cik) + '&type=8-K&dateb='\
           + str(priorto) + '&owner=exclude&output=xml&count='\
           + str(count)
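# With the values above, the assembled URL is:
# http://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=0001018724&type=8-K&dateb=20180101&owner=exclude&output=xml&count=100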
r = requests.get(base_url)
data = r.text
soup = BeautifulSoup(data, 'lxml')
# Get the URL for each 8-K detail page using the 'filinghref' tag:
detail_urls = soup.find_all('filinghref')
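# (Each <filingHREF> element in EDGAR's XML results holds the URL of one
# filing's index page. The 'lxml' parser lowercases tag names when parsing,
# which is why the search string here is 'filinghref'.)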
# Iterate through each URL:
for url in detail_urls:
    # If the URL's suffix is 'htm', change it to 'html'
    url = url.string
    if url.split('.')[-1] == 'htm':
        url += 'l'
    # Turn each URL into soup
    r = requests.get(url)
    data = r.text
    soup = BeautifulSoup(data, 'lxml')
    # Locate the filing date on the detail page by searching by tag and class
    filing_date = soup.find_all('div', {'class': 'info'})[1].string
    # The suffix of the Form 8-K document can be found on this page too
    url_suffix = soup.find_all('tr')[1].find_all('td')[2].get_text()
    # Slice off the end of our URL, and add the new suffix to get the Form 8-K URL
    doc_url = url[0:-31] + url_suffix
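    # (The 31 characters sliced off are assumed to be the index filename,
    # e.g. '0001018724-17-000123-index.html': a 20-character accession
    # number plus '-index.html'. The accession number shown is made up.)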
    # Turn the Form 8-K URL into soup
    r = requests.get(doc_url)
    data = r.text
    soup = BeautifulSoup(data, 'lxml')
    # Finally, the text we're after!
    # Get rid of the commas, quotation marks, and any newline characters
    text = soup.get_text().replace(',', '').replace('"', '').replace('\n', ' ')
    # Combine the data into a CSV-style string and write it to the file
    # created before the for loop started.
    write_string = ticker + ',' + filing_date + ',' + text + '\n'
    # 'a' opens the file in append mode; use UTF-8 to match the header write
    with open('data/my_file.csv', 'a', encoding='utf-8') as f:
        f.write(write_string)
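Since the point of writing line by line is that a crash costs you at most one row, a natural extension is to guard each iteration so a single bad filing is skipped rather than aborting the whole run. Below is a minimal sketch of that pattern; process_filing is a hypothetical helper standing in for the per-URL steps above, returning one finished CSV row.

def process_filing(detail_url):
    # Hypothetical stand-in for the detail-page and document steps above;
    # it should return one finished 'ticker,filing_date,text\n' row.
    raise NotImplementedError

for url in detail_urls:
    try:
        write_string = process_filing(url.string)
    except Exception as e:
        # Log and move on; rows already written are safe on disk.
        print('Skipping ' + url.string + ': ' + str(e))
        continue
    with open('data/my_file.csv', 'a', encoding='utf-8') as f:
        f.write(write_string)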
Hello
How are you?
Thanks for the script. When I ran it, I got this error:
line 19, in encode
return codecs.charmap_encode(input,self.errors,encoding_table)[0]
UnicodeEncodeError: 'charmap' codec can't encode character '\x92' in position 696: character maps to <undefined>