Get Nasdaq-100 stocks trade data for the trailing 2 years from polygon.io
from bs4 import BeautifulSoup
from datetime import timedelta
import pandas as pd
import pandas_market_calendars as mcal
import requests
import time
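# Dependencies (assumed install command, matching the imports above):
#   pip install beautifulsoup4 pandas pandas_market_calendars requests lxml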
# ----------------------------------------------
# Scrape NASDAQ 100 stock ticker names from wiki
# ----------------------------------------------
def get_nasdaq_100_tickers():
    # Create a soup object of the target HTML page
    url = 'https://en.wikipedia.org/wiki/Nasdaq-100'
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'lxml')
    # Find the table of rows containing the NASDAQ 100 constituents
    table = soup.find('table', {'id': 'constituents'})
    rows = table.tbody.find_all('tr')
    # Create a list to store tickers
    tickers = []
    for row in rows[1:]:  # Skip the header row
        # Access the ticker from the row data (in the 2nd column)
        td = row.find_all('td')[1]
        # Append the ticker to the list
        tickers.append(td.text.strip())
    # Remove the class-A ticker for Alphabet since class-C (GOOG) already exists
    tickers.remove('GOOGL')
    return tickers
# Get NASDAQ 100 tickers
tickers = get_nasdaq_100_tickers()
# Print number of tickers collected
print(f'Tickers Found: {len(tickers)}') ## Tickers Found: 100
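# Note: the scrape assumes Wikipedia keeps the constituents table's id
# attribute set to 'constituents'; update the selector if the page changes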
# -------------------------------------------
# Get valid trading days for trailing 2 years
# -------------------------------------------
# Get the trading calendar for the New York Stock Exchange (NYSE)
nyse = mcal.get_calendar('NYSE')
# Set the latest end date in the US Eastern time zone (ET)
end_date = pd.Timestamp.now(tz='US/Eastern')
# Set the start date 2 years behind the end date (the timezone is preserved)
start_date = end_date - pd.DateOffset(years=2)
# Get valid trading days for the last 2 years, but don't pass timezone-aware dates here
trading_days = nyse.valid_days(start_date=start_date.tz_localize(None), end_date=end_date.tz_localize(None))
# Then convert the timezone-aware result to 'US/Eastern'
trading_days = trading_days.tz_convert('US/Eastern')
# Print the number of trading days in the trailing 2 years
print(f'No. of trading days in trailing 2 years: {len(trading_days)}')
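# For reference: the NYSE has roughly 252 trading days per year, so a
# trailing-2-year window should yield on the order of 500 valid days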
# -------------------------------
# Define a function to fetch data
# -------------------------------
# Function to fetch minute-bar aggregates for a ticker within a given date range
def fetch_data(ticker, start_date, end_date):
    url = f"https://api.polygon.io/v2/aggs/ticker/{ticker}/range/1/minute/{start_date}/{end_date}"
    params = {"sort": "asc", "limit": 50000}
    # Use your own polygon.io API key here
    headers = {"Authorization": "Bearer <YOUR_POLYGON_API_KEY>"}
    response = requests.get(url, params=params, headers=headers)
    # 'results' is absent when a request fails or returns no data, so fall
    # back to an empty list instead of raising a KeyError
    data = response.json().get('results', [])
    df = pd.DataFrame(data)
    return df
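# The raw 'results' records use Polygon's short column keys, so the returned
# DataFrame columns are: v (volume), vw (volume-weighted average price),
# o (open), c (close), h (high), l (low), t (Unix timestamp in milliseconds),
# and n (number of transactions).
# If a window were ever to exceed the 50,000-row limit, Polygon's response
# includes a 'next_url' field for pagination. A minimal sketch of a paginated
# variant, assuming that field name from Polygon's v2 aggregates docs (note
# that each followed page counts as an extra API call):
def fetch_data_paginated(ticker, start_date, end_date):
    url = f"https://api.polygon.io/v2/aggs/ticker/{ticker}/range/1/minute/{start_date}/{end_date}"
    params = {"sort": "asc", "limit": 50000}
    headers = {"Authorization": "Bearer <YOUR_POLYGON_API_KEY>"}
    frames = []
    while url:
        payload = requests.get(url, params=params, headers=headers).json()
        frames.append(pd.DataFrame(payload.get('results', [])))
        # 'next_url' is only present when more pages remain
        url = payload.get('next_url')
        # next_url already carries the query parameters, so drop them
        params = None
    return pd.concat(frames, ignore_index=True)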
# --------------------------------------------------
# Get data for each ticker and save it to a csv file
# --------------------------------------------------
# Initialize variables to track progress
start_time = time.time()
total_api_calls = 0
for t in range(len(tickers)):
    # Select a ticker
    ticker = tickers[t]
    # Initialize an empty DataFrame for the ticker
    df_all = pd.DataFrame()
    # Loop through trading days in windows of 52 days because the aggregates
    # endpoint caps each request at 50,000 data points
    for day in range(0, len(trading_days), 52):
        # Select the date range for a span of 52 trading days; the window ends
        # at day + 51 so consecutive windows don't overlap on their boundary
        # day (Polygon treats the from/to date range as inclusive)
        start_date = trading_days[day].date().isoformat()
        end_date = trading_days[min(day + 51, len(trading_days) - 1)].date().isoformat()
        # Fetch data for the ticker in that date range
        df = fetch_data(ticker, start_date, end_date)
        # Increment the total API call count by 1
        total_api_calls += 1
        # Append the data to the ticker's DataFrame
        df_all = pd.concat([df_all, df], ignore_index=True)
        # Sleep for 12 seconds to stay within the API rate limit (5 requests per minute)
        time.sleep(12)
        # Print a progress update
        elapsed_time = time.time() - start_time
        print(f"\rProcessing ticker {t+1}/100: {ticker} || API Calls: {total_api_calls}/1000 || Elapsed Time: {str(timedelta(seconds=int(elapsed_time)))}", end="")
    # Write the ticker's data to a CSV file
    df_all.to_csv(f"{ticker}.csv", index=False)
# Each ticker spans 10 request windows, so total_api_calls // 10 approximates
# the number of fully fetched tickers
print(f'\nTrailing 2 years trade data was successfully fetched for {total_api_calls//10} out of 100 stocks and total api calls made were: {total_api_calls}')
## OUTPUT: Processing ticker 100/100: ZS || API Calls: 960/1000 || Elapsed Time: 7:43:31
## Trailing 2 years trade data was successfully fetched for 96 out of 100 stocks and total api calls made were: 960
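# Reading a saved file back (a minimal sketch; 't' holds the Unix timestamp
# in milliseconds, per the column notes above):
# df = pd.read_csv(f"{tickers[0]}.csv")
# df['t'] = pd.to_datetime(df['t'], unit='ms', utc=True).dt.tz_convert('US/Eastern')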
# ----------------------------------------------------- E N D -----------------------------------------------------