Skip to content

Instantly share code, notes, and snippets.

@buswedg
Created April 8, 2020 03:52
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save buswedg/98f18435171ec7c9074808820408bd9c to your computer and use it in GitHub Desktop.
Save buswedg/98f18435171ec7c9074808820408bd9c to your computer and use it in GitHub Desktop.
pulling_free_us_stock_market_data\data_downloader
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import sys\n",
"\n",
"import logging\n",
"\n",
"import datetime as dt\n",
"from datetime import date\n",
"\n",
"import string\n",
"import re\n",
"\n",
"import pandas as pd\n",
"\n",
"import yfinance as yf # will likely need to pip install\n",
"# note an issue with yfinance here: github.com/ranaroussi/yfinance/issues/214\n",
"from yahoofinancials import YahooFinancials # will likely need to pip install\n",
"# note an issue with YahooFinancials here: https://github.com/sylvandb/yahoofinancials/commit/9b5133d8c2b3d0fb7ca7cd76816b2049c38f7005#diff-859f0dd652dec0f5114399943a987467"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def check_or_create_dir(directory):\n",
"\n",
" if not check_exists(directory):\n",
" os.makedirs(directory)\n",
" \n",
" \n",
"def check_exists(path):\n",
"\n",
" return os.path.exists(path)\n",
"\n",
"\n",
"def get_latest_file(dir):\n",
" ls_files = [f for f in os.listdir(dir) if os.path.isfile(os.path.join(dir, f))]\n",
" ls_files.sort(key=lambda x: dt.datetime.strptime(x.split('_')[0], '%Y%m%d'))\n",
"\n",
" latest_file = ls_files[0] if len(ls_files) > 0 else None\n",
"\n",
" return latest_file"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def update_csv(df, dic_dir, dic_args):\n",
" data_path = os.path.join(dic_dir['data_dir'], dic_args['entity_symbol'] + \".csv.gz\")\n",
"\n",
" if check_exists(data_path):\n",
" df_csv = pd.read_csv(data_path, header=0, sep=';', quotechar='\"', compression='gzip')\n",
"\n",
" df = df_csv.append(df, sort=False)\n",
" df = df.drop_duplicates(subset='pr_key', keep='last')\n",
"\n",
" df.to_csv(data_path, sep=';', quotechar='\"', date_format='%Y-%m-%d %H:%M:%S.%f', compression='gzip', index=False)\n",
"\n",
"\n",
"def open_tracker(tracker_path):\n",
" df_tracker = pd.read_csv(tracker_path, header=0, sep=';', quotechar='\"')\n",
"\n",
" df_tracker['last_ticker_pull'] = pd.to_datetime(df_tracker['last_ticker_pull'], format='%Y-%m-%d %H:%M:%S.%f')\n",
" df_tracker['last_indicator_calc'] = pd.to_datetime(df_tracker['last_indicator_calc'], format='%Y-%m-%d %H:%M:%S.%f')\n",
"\n",
" return df_tracker\n",
"\n",
"\n",
"def download_yfinance_prices(dic_dir, dic_args, update_time):\n",
"\n",
" dic_args['entity_query'] = dic_args['entity_symbol']\n",
"\n",
" ls_table_types = ['eod', 'intra']\n",
"\n",
" try:\n",
" ticker = yf.Ticker(dic_args['entity_query'])\n",
" df_eod = ticker.history(period='max', auto_adjust=False, rounding=False)\n",
"\n",
" except Exception:\n",
" print('ERROR: yfinance class call issue')\n",
" raise\n",
"\n",
" ls_periods = ['1mo', '5d', '1d']\n",
"\n",
" p_index = 0\n",
" df_intra = pd.DataFrame([])\n",
"\n",
" while (p_index < len(ls_periods)) and df_intra.empty:\n",
" try:\n",
" df_intra = ticker.history(period=ls_periods[p_index], interval='5m', auto_adjust=False, rounding=False)\n",
"\n",
" except Exception:\n",
" print('ERROR: yfinance class call issue')\n",
" pass\n",
"\n",
" p_index += 1\n",
"\n",
" dic_df_ref = {'eod': df_eod,\n",
" 'intra': df_intra}\n",
"\n",
" for table_type in ls_table_types:\n",
" dic_args['table_type'] = table_type\n",
"\n",
" df = dic_df_ref[dic_args['table_type']]\n",
"\n",
" if df.empty:\n",
" print('ERROR: empty dataframe')\n",
" continue\n",
"\n",
" dic_cols_rename = dic_cols_rename_ref[dic_args['table_type']]\n",
"\n",
" if not set(df.columns).issubset(dic_cols_rename.keys()):\n",
" print('WARNING: unknown columns encountered')\n",
"\n",
" df = df.rename(columns=dic_cols_rename)\n",
"\n",
" df = df.rename_axis('date').reset_index()\n",
"\n",
" for param in ['entity_symbol', 'region', 'composite_symbol', 'data_subcat', 'table_type', 'data_source']:\n",
" df[param] = dic_args[param]\n",
"\n",
" df['date'] = df['date'].dt.strftime('%Y-%m-%d %H:%M:%S.%f')\n",
" df['pr_key'] = df['entity_symbol'] + '.' + df['composite_symbol'] + '_' + df['table_type'] + '_' + df['date'].apply(str)\n",
" df['last_updated'] = update_time.strftime('%Y-%m-%d %H:%M:%S.%f')\n",
"\n",
" ls_cols = dic_ls_cols_ref[dic_args['data_source']][dic_args['table_type']]\n",
" df = pd.DataFrame(columns=ls_cols).append(df)\n",
"\n",
" dic_dir['data_dir'] = os.path.join(dic_dir['base'], dic_args['data_dir'])\n",
"\n",
" for param in re.findall(r'\\{(.*?)\\}', dic_dir['data_dir']):\n",
" dic_dir['data_dir'] = dic_dir['data_dir'].replace('{'+param+'}', dic_args[param])\n",
"\n",
" check_or_create_dir(dic_dir['data_dir'])\n",
"\n",
" update_csv(df, dic_dir, dic_args)\n",
"\n",
"\n",
"def download_yfinance_fundamentals(dic_dir, dic_args, update_time):\n",
"\n",
" dic_args['entity_query'] = dic_args['entity_symbol']\n",
"\n",
" ls_table_types = ['earnings', 'balance_sheet', 'cashflow', 'financials']\n",
"\n",
" try:\n",
" ticker = yf.Ticker(dic_args['entity_query'])\n",
"\n",
" dic_df_a_ref = {'earnings': ticker.earnings,\n",
" 'balance_sheet': ticker.balance_sheet,\n",
" 'cashflow': ticker.cashflow,\n",
" 'financials': ticker.financials}\n",
"\n",
" dic_df_q_ref = {'earnings': ticker.quarterly_earnings,\n",
" 'balance_sheet': ticker.quarterly_balance_sheet,\n",
" 'cashflow': ticker.quarterly_cashflow,\n",
" 'financials': ticker.quarterly_financials}\n",
"\n",
" except Exception:\n",
" print('ERROR: yfinance class call issue')\n",
" raise\n",
"\n",
" for table_type in ls_table_types:\n",
" dic_args['table_type'] = table_type\n",
"\n",
" df_a = dic_df_a_ref[table_type]\n",
" df_q = dic_df_q_ref[table_type]\n",
"\n",
" if df_a.empty and df_q.empty:\n",
" print('ERROR: empty dataframe')\n",
" continue\n",
"\n",
" if table_type != 'earnings': # don't transpose earnings\n",
" df_a = df_a.T\n",
" df_q = df_q.T\n",
"\n",
" df_a['statement_period'] = 'annual'\n",
" df_q['statement_period'] = 'quarterly'\n",
"\n",
" df = df_q.append(df_a)\n",
"\n",
" dic_cols_rename = dic_cols_rename_ref[dic_args['table_type']]\n",
"\n",
" if not set([c for c in list(df.columns) if c != 'statement_period']).issubset(dic_cols_rename.keys()):\n",
" print('WARNING: unknown columns encountered')\n",
"\n",
" df = df.rename(columns=dic_cols_rename)\n",
"\n",
" df = df.rename_axis('date').reset_index()\n",
"\n",
" for param in ['entity_symbol', 'region', 'composite_symbol', 'data_subcat', 'table_type', 'data_source']:\n",
" df[param] = dic_args[param]\n",
"\n",
" if table_type != 'earnings': # earnings uses cat. dt\n",
" df['date'] = df['date'].dt.strftime('%Y-%m-%d %H:%M:%S.%f')\n",
"\n",
" df['pr_key'] = df['entity_symbol'] + '.' + df['composite_symbol'] + '_' + df['table_type'] + '_' + df['date'].apply(str)\n",
" df['last_updated'] = update_time.strftime('%Y-%m-%d %H:%M:%S.%f')\n",
"\n",
" ls_cols = dic_ls_cols_ref[dic_args['data_source']][dic_args['table_type']]\n",
" df = pd.DataFrame(columns=ls_cols).append(df)\n",
"\n",
" dic_dir['data_dir'] = os.path.join(dic_dir['base'], dic_args['data_dir'])\n",
"\n",
" for param in re.findall(r'\\{(.*?)\\}', dic_dir['data_dir']):\n",
" dic_dir['data_dir'] = dic_dir['data_dir'].replace('{'+param+'}', dic_args[param])\n",
"\n",
" check_or_create_dir(dic_dir['data_dir'])\n",
"\n",
" update_csv(df, dic_dir, dic_args)\n",
"\n",
"\n",
"def download_yahoofinance_fundamentals(dic_dir, dic_args, update_time):\n",
" ls_table_types = ['key_stats']\n",
"\n",
" try:\n",
" yahoo_financials = YahooFinancials(dic_args['entity_query'])\n",
" ticker = yahoo_financials.get_key_statistics_data()\n",
"\n",
" except Exception:\n",
" print('ERROR: yahoofinancials class call issue')\n",
" raise\n",
"\n",
" df_key_stats = pd.DataFrame(ticker).T\n",
"\n",
" dic_df_ref = {'key_stats': df_key_stats}\n",
"\n",
" for table_type in ls_table_types:\n",
" dic_args['table_type'] = table_type\n",
"\n",
" df = dic_df_ref[table_type]\n",
"\n",
" if df.empty:\n",
" print('ERROR: empty dataframe')\n",
" continue\n",
"\n",
" dic_cols_rename = dic_cols_rename_ref[dic_args['table_type']]\n",
"\n",
" if not set(df.columns).issubset(dic_cols_rename.keys()):\n",
" print('WARNING: unknown columns encountered')\n",
"\n",
" df = df.rename(columns=dic_cols_rename)\n",
"\n",
" df = df.rename_axis('date').reset_index()\n",
"\n",
" for param in ['entity_symbol', 'region', 'composite_symbol', 'data_subcat', 'table_type', 'data_source']:\n",
" df[param] = dic_args[param]\n",
"\n",
" df['date'] = update_time.strftime('%Y-%m-%d %H:%M:%S.%f')\n",
" df['pr_key'] = df['entity_symbol'] + '.' + df['composite_symbol'] + '_' + df['table_type'] + '_' + df['date'].apply(str)\n",
" df['last_updated'] = update_time.strftime('%Y-%m-%d %H:%M:%S.%f')\n",
"\n",
" ls_cols = dic_ls_cols_ref[dic_args['data_source']][dic_args['table_type']]\n",
" df = pd.DataFrame(columns=ls_cols).append(df)\n",
"\n",
" dic_dir['data_dir'] = os.path.join(dic_dir['base'], dic_args['data_dir'])\n",
"\n",
" for param in re.findall(r'\\{(.*?)\\}', dic_dir['data_dir']):\n",
" dic_dir['data_dir'] = dic_dir['data_dir'].replace('{'+param+'}', dic_args[param])\n",
"\n",
" check_or_create_dir(dic_dir['data_dir'])\n",
"\n",
" update_csv(df, dic_dir, dic_args)\n",
"\n",
"\n",
"def try_download_data(dic_dir, dic_args, update_time, tries_count=1):\n",
" max_tries = 1 # maximum download attempts\n",
" sleep_time = 0 # sleep time between download attempts\n",
"\n",
" ############################################################\n",
"\n",
" downloaded = False\n",
"\n",
" log_str = \"download {} data for {}\".format(dic_args['data_type'], dic_args['entity_symbol'])\n",
"\n",
" try:\n",
" print(\"\\n\" + \"ATTEMPT: \" + log_str)\n",
" logging.info(\"\\n\" + \"ATTEMPT: \" + log_str)\n",
"\n",
" if (dic_args['data_source'] == 'yfinance') and (dic_args['data_type'] == 'price_data'):\n",
" download_yfinance_prices(dic_dir, dic_args, update_time)\n",
"\n",
" elif (dic_args['data_source'] == 'yfinance') and (dic_args['data_type'] == 'fund_data'):\n",
" download_yfinance_fundamentals(dic_dir, dic_args, update_time)\n",
"\n",
" elif (dic_args['data_source'] == 'yahoofinance') and (dic_args['data_type'] == 'fund_data'):\n",
" download_yahoofinance_fundamentals(dic_dir, dic_args, update_time)\n",
"\n",
" print(\"SUCCESS: \" + log_str)\n",
" logging.info(\"SUCCESS: \" + log_str)\n",
"\n",
" downloaded = True\n",
"\n",
" except Exception as e:\n",
" print(str(e))\n",
" logging.error(str(e))\n",
"\n",
" print(\"FAILED: \" + log_str)\n",
" logging.error(\"FAILED: \" + log_str)\n",
"\n",
" if tries_count < max_tries:\n",
" time.sleep(sleep_time)\n",
"\n",
" downloaded = try_download_data(dic_dir, dic_args, update_time, tries_count=tries_count+1)\n",
"\n",
" return downloaded\n",
"\n",
" return downloaded\n",
"\n",
"\n",
"def pull_data(dic_dir, dic_args):\n",
" dic_dir['tracker_dir'] = os.path.join(dic_dir['base'], dic_args['tracker_dir'])\n",
" entity_tracker_file = get_latest_file(dic_dir['tracker_dir'])\n",
"\n",
" if entity_tracker_file is not None:\n",
" entity_tracker_path = os.path.join(dic_dir['tracker_dir'], entity_tracker_file)\n",
" df_entity_tracker = open_tracker(entity_tracker_path)\n",
"\n",
" else:\n",
" raise CustomException\n",
"\n",
" for index, row in df_entity_tracker.iterrows():\n",
"\n",
" if (dic_args['index_filter'] is None) or (row[dic_args['index_filter']] == \"Y\"):\n",
" update_time = dt.datetime.now()\n",
"\n",
" if update_time > row['last_ticker_pull']:\n",
" dic_args['entity_symbol'] = row['entity_symbol']\n",
" dic_args['composite_symbol'] = row['composite_symbol']\n",
"\n",
" if try_download_data(dic_dir, dic_args, update_time):\n",
" df_entity_tracker.loc[index, 'last_ticker_pull'] = update_time\n",
"\n",
" df_entity_tracker.to_csv(entity_tracker_path, sep=';', quotechar='\"', date_format='%Y-%m-%d %H:%M:%S.%f', index=False)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dic_cols_rename_price = {\n",
" 'Date': 'date',\n",
" 'Symbol': 'entity_symbol',\n",
" 'Open': 'open',\n",
" 'High': 'high',\n",
" 'Low': 'low',\n",
" 'Close': 'close',\n",
" 'Adj Close': 'adj_close',\n",
" 'Volume': 'volume',\n",
" 'Dividends': 'dividend',\n",
" 'Stock Splits': 'split'\n",
"}\n",
"\n",
"\n",
"dic_cols_rename_balance_sheet = {\n",
" 'Intangible Assets': 'intangible_assets',\n",
" 'Capital Surplus': 'capital_surplus',\n",
" 'Total Liab': 'total_liabilities',\n",
" 'Total Stockholder Equity': 'total_stockholder_equity',\n",
" 'Other Current Liab': 'other_current_liabilities',\n",
" 'Total Assets': 'total_assets',\n",
" 'Common Stock': 'common_stock',\n",
" 'Other Current Assets': 'other_current_assets',\n",
" 'Retained Earnings': 'retained_earnings',\n",
" 'Other Liab': 'other_liabilities',\n",
" 'Good Will': 'good_will',\n",
" 'Treasury Stock': 'treasury_stock',\n",
" 'Other Assets': 'other_assets',\n",
" 'Cash': 'cash',\n",
" 'Total Current Liabilities': 'total_current_liabilities',\n",
" 'Other Stockholder Equity': 'other_stockholder_equity',\n",
" 'Property Plant Equipment': 'property_plant_equipment',\n",
" 'Total Current Assets': 'total_current_assets',\n",
" 'Short Term Investments': 'short_term_investments',\n",
" 'Long Term Investments': 'long_term_investments',\n",
" 'Net Tangible Assets': 'net_tangible_assets',\n",
" 'Net Receivables': 'net_receivables',\n",
" 'Long Term Debt': 'long_term_debt',\n",
" 'Inventory': 'inventory',\n",
" 'Accounts Payable': 'accounts_payable',\n",
" 'Deferred Long Term Asset Charges': 'deferred_long_term_asset_charges',\n",
" 'Deferred Long Term Liab': 'deferred_long_term_liabilities',\n",
" 'Short Long Term Debt': 'short_long_term_debt',\n",
" 'Minority Interest': 'minority_interest'\n",
"}\n",
"\n",
"\n",
"dic_cols_rename_cashflow = {\n",
" 'Investments': 'investments',\n",
" 'Change To Liabilities': 'change_to_liabilities',\n",
" 'Total Cashflows From Investing Activities': 'total_cashflows_from_investing_activities',\n",
" 'Net Borrowings': 'net_borrowings',\n",
" 'Total Cash From Financing Activities': 'total_cash_from_financing_activities',\n",
" 'Change To Operating Activities': 'change_to_operating_activities',\n",
" 'Issuance Of Stock': 'issuance_of_stock',\n",
" 'Net Income': 'net_income',\n",
" 'Change In Cash': 'change_in_cash',\n",
" 'Repurchase Of Stock': 'repurchase_of_stock',\n",
" 'Effect Of Exchange Rate': 'effect_of_exchange_rate',\n",
" 'Total Cash From Operating Activities': 'total_cash_from_operating_activities',\n",
" 'Depreciation': 'depreciation',\n",
" 'Other Cashflows From Investing Activities': 'other_cashflows_from_investing_activities',\n",
" 'Dividends Paid': 'dividends_paid',\n",
" 'Change To Inventory': 'change_to_inventory',\n",
" 'Change To Account Receivables': 'change_to_account_receivables',\n",
" 'Other Cashflows From Financing Activities': 'other_cashflows_from_financing_activities',\n",
" 'Change To Netincome': 'change_to_net_income',\n",
" 'Capital Expenditures': 'capital_expenditures'\n",
"}\n",
"\n",
"\n",
"dic_cols_rename_earnings = {\n",
" 'Revenue': 'revenue',\n",
" 'Earnings': 'earnings'\n",
"}\n",
"\n",
"\n",
"dic_cols_rename_financials = {\n",
" 'Research Development': 'research_development',\n",
" 'Effect Of Accounting Charges': 'effect_oaccounting_charges',\n",
" 'Income Before Tax': 'income_before_tax',\n",
" 'Minority Interest': 'minority_interest',\n",
" 'Net Income': 'net_income',\n",
" 'Selling General Administrative': 'selling_general_administrative',\n",
" 'Gross Profit': 'gross_profit',\n",
" 'Ebit': 'ebit',\n",
" 'Operating Income': 'operating_income',\n",
" 'Other Operating Expenses': 'other_operating_expenses',\n",
" 'Interest Expense': 'interest_expense',\n",
" 'Extraordinary Items': 'extraordinary_items',\n",
" 'Non Recurring': 'non_recurring',\n",
" 'Other Items': 'other_items',\n",
" 'Income Tax Expense': 'income_tax_expense',\n",
" 'Total Revenue': 'total_revenue',\n",
" 'Total Operating Expenses': 'total_operating_expenses',\n",
" 'Cost Of Revenue': 'cost_orevenue',\n",
" 'Total Other Income Expense Net': 'total_other_income_expense_net',\n",
" 'Discontinued Operations': 'discontinued_operations',\n",
" 'Net Income From Continuing Ops': 'net_income_from_continuing_ops',\n",
" 'Net Income Applicable To Common Shares': 'net_income_applicable_to_common_shares'\n",
"}\n",
"\n",
"\n",
"dic_cols_rename_key_stats = {\n",
" 'annualHoldingsTurnover': 'annual_holdings_turnover',\n",
" 'enterpriseToRevenue': 'enterprise_to_revenue',\n",
" 'beta3Year': 'beta_3_year',\n",
" 'profitMargins': 'profit_margins',\n",
" 'enterpriseToEbitda': 'enterprise_to_ebita',\n",
" '52WeekChange': 'share_52_week_change',\n",
" 'morningStarRiskRating': 'morningstar_risk_rating',\n",
" 'forwardEps': 'forward_eps',\n",
" 'revenueQuarterlyGrowth': 'revenue_quarterly_growth',\n",
" 'sharesOutstanding': 'shares_outstanding',\n",
" 'fundInceptionDate': 'fund_inception_date',\n",
" 'annualReportExpenseRatio': 'annual_report_expense_ratio',\n",
" 'totalAssets': 'total_assets',\n",
" 'bookValue': 'book_value',\n",
" 'sharesShort': 'shares_short',\n",
" 'sharesPercentSharesOut': 'shares_percent_shares_out',\n",
" 'fundFamily': 'fund_family',\n",
" 'lastFiscalYearEnd': 'last_fiscal_year_end',\n",
" 'heldPercentInstitutions': 'held_percent_institutions',\n",
" 'netIncomeToCommon': 'net_income_to_common',\n",
" 'trailingEps': 'trailing_eps',\n",
" 'lastDividendValue': 'last_dividend_value',\n",
" 'SandP52WeekChange': 'sandp_52_week_change',\n",
" 'priceToBook': 'price_to_book',\n",
" 'heldPercentInsiders': 'held_percent_insiders',\n",
" 'nextFiscalYearEnd': 'next_fiscal_year_end',\n",
" 'yield': 'yield',\n",
" 'mostRecentQuarter': 'most_recent_quarter',\n",
" 'shortRatio': 'short_ratio',\n",
" 'sharesShortPreviousMonthDate': 'shares_short_previous_month_date',\n",
" 'floatShares': 'float_shares',\n",
" 'beta': 'beta',\n",
" 'enterpriseValue': 'enterprise_value',\n",
" 'priceHint': 'price_hint',\n",
" 'threeYearAverageReturn': 'three_year_average_return',\n",
" 'lastSplitDate': 'last_split_date',\n",
" 'lastSplitFactor': 'last_split_factor',\n",
" 'legalType': 'legal_type',\n",
" 'morningStarOverallRating': 'morning_star_overall_rating',\n",
" 'earningsQuarterlyGrowth': 'earnings_quarterly_growth',\n",
" 'priceToSalesTrailing12Months': 'price_to_sales_trailing_12_months',\n",
" 'dateShortInterest': 'date_short_interest',\n",
" 'pegRatio': 'peg_ratio',\n",
" 'ytdReturn': 'ytd_return',\n",
" 'forwardPE': 'forward_pe',\n",
" 'maxAge': 'max_age',\n",
" 'lastCapGain': 'last_cap_gain',\n",
" 'shortPercentOfFloat': 'shares_percent_of_float',\n",
" 'sharesShortPriorMonth': 'shares_short_prior_month',\n",
" 'category': 'category',\n",
" 'fiveYearAverageReturn': 'five_year_average_return'\n",
"}\n",
"\n",
"\n",
"ls_cols_yfinance_price = [\n",
" 'pr_key',\n",
" 'date',\n",
" 'entity_symbol',\n",
" 'region',\n",
" 'composite_symbol',\n",
" 'data_subcat',\n",
" 'table_type',\n",
" 'open',\n",
" 'high',\n",
" 'low',\n",
" 'close',\n",
" 'adj_close',\n",
" 'volume',\n",
" 'dividend',\n",
" 'split',\n",
" 'data_source',\n",
" 'last_updated'\n",
"]\n",
"\n",
"ls_cols_yfinance_balance_sheet = [\n",
" 'pr_key',\n",
" 'date',\n",
" 'entity_symbol',\n",
" 'region',\n",
" 'composite_symbol',\n",
" 'data_subcat',\n",
" 'table_type',\n",
" 'statement_period',\n",
" 'intangible_assets',\n",
" 'capital_surplus',\n",
" 'total_liabilities',\n",
" 'total_stockholder_equity',\n",
" 'other_current_liabilities',\n",
" 'total_assets',\n",
" 'common_stock',\n",
" 'other_current_assets',\n",
" 'retained_earnings',\n",
" 'other_liabilities',\n",
" 'good_will',\n",
" 'treasury_stock',\n",
" 'other_assets',\n",
" 'cash',\n",
" 'total_current_liabilities',\n",
" 'other_stockholder_equity',\n",
" 'property_plant_equipment',\n",
" 'total_current_assets',\n",
" 'short_term_investments',\n",
" 'long_term_investments',\n",
" 'net_tangible_assets',\n",
" 'net_receivables',\n",
" 'long_term_debt',\n",
" 'inventory',\n",
" 'accounts_payable',\n",
" 'deferred_long_term_asset_charges',\n",
" 'deferred_long_term_liabilities',\n",
" 'short_long_term_debt',\n",
" 'minority_interest',\n",
" 'data_source',\n",
" 'last_updated'\n",
"]\n",
"\n",
"ls_cols_yfinance_cashflow = [\n",
" 'pr_key',\n",
" 'date',\n",
" 'entity_symbol',\n",
" 'region',\n",
" 'composite_symbol',\n",
" 'data_subcat',\n",
" 'table_type',\n",
" 'statement_period',\n",
" 'investments',\n",
" 'change_to_liabilities',\n",
" 'total_cashflows_from_investing_activities',\n",
" 'net_borrowings',\n",
" 'total_cash_from_financing_activities',\n",
" 'change_to_operating_activities',\n",
" 'issuance_of_stock',\n",
" 'net_income',\n",
" 'change_in_cash',\n",
" 'repurchase_of_stock',\n",
" 'effect_of_exchange_rate',\n",
" 'total_cash_from_operating_activities',\n",
" 'depreciation',\n",
" 'other_cashflows_from_investing_activities',\n",
" 'dividends_paid',\n",
" 'change_to_inventory',\n",
" 'change_to_account_receivables',\n",
" 'other_cashflows_from_financing_activities',\n",
" 'change_to_net_income',\n",
" 'capital_expenditures',\n",
" 'data_source',\n",
" 'last_updated'\n",
"]\n",
"\n",
"ls_cols_yfinance_earnings = {\n",
" 'pr_key',\n",
" 'date',\n",
" 'entity_symbol',\n",
" 'region',\n",
" 'composite_symbol',\n",
" 'data_subcat',\n",
" 'table_type',\n",
" 'statement_period',\n",
" 'revenue',\n",
" 'earnings',\n",
" 'data_source',\n",
" 'last_updated'\n",
"}\n",
"\n",
"ls_cols_yfinance_financials = [\n",
" 'pr_key',\n",
" 'date',\n",
" 'entity_symbol',\n",
" 'region',\n",
" 'composite_symbol',\n",
" 'data_subcat',\n",
" 'table_type',\n",
" 'statement_period',\n",
" 'research_development',\n",
" 'effect_oaccounting_charges',\n",
" 'income_before_tax',\n",
" 'minority_interest',\n",
" 'net_income',\n",
" 'selling_general_administrative',\n",
" 'gross_profit',\n",
" 'ebit',\n",
" 'operating_income',\n",
" 'other_operating_expenses',\n",
" 'interest_expense',\n",
" 'extraordinary_items',\n",
" 'non_recurring',\n",
" 'other_items',\n",
" 'income_tax_expense',\n",
" 'total_revenue',\n",
" 'total_operating_expenses',\n",
" 'cost_orevenue',\n",
" 'total_other_income_expense_net',\n",
" 'discontinued_operations',\n",
" 'net_income_from_continuing_ops',\n",
" 'net_income_applicable_to_common_shares',\n",
" 'data_source',\n",
" 'last_updated'\n",
"]\n",
"\n",
"ls_cols_yahoofinance_key_stats = [\n",
" 'pr_key',\n",
" 'date',\n",
" 'entity_symbol',\n",
" 'region',\n",
" 'composite_symbol',\n",
" 'data_subcat',\n",
" 'table_type',\n",
" 'annual_holdings_turnover',\n",
" 'enterprise_to_revenue',\n",
" 'beta_3_year',\n",
" 'profit_margins',\n",
" 'enterprise_to_ebita',\n",
" 'share_52_week_change',\n",
" 'morningstar_risk_rating',\n",
" 'forward_eps',\n",
" 'revenue_quarterly_growth',\n",
" 'shares_outstanding',\n",
" 'fund_inception_date',\n",
" 'annual_report_expense_ratio',\n",
" 'total_assets',\n",
" 'book_value',\n",
" 'shares_short',\n",
" 'shares_percent_shares_out',\n",
" 'fund_family',\n",
" 'last_fiscal_year_end',\n",
" 'held_percent_institutions',\n",
" 'net_income_to_common',\n",
" 'trailing_eps',\n",
" 'last_dividend_value',\n",
" 'sandp_52_week_change',\n",
" 'price_to_book',\n",
" 'held_percent_insiders',\n",
" 'next_fiscal_year_end',\n",
" 'yield',\n",
" 'most_recent_quarter',\n",
" 'short_ratio',\n",
" 'shares_short_previous_month_date',\n",
" 'float_shares',\n",
" 'beta',\n",
" 'enterprise_value',\n",
" 'price_hint',\n",
" 'three_year_average_return',\n",
" 'last_split_date',\n",
" 'last_split_factor',\n",
" 'legal_type',\n",
" 'morning_star_overall_rating',\n",
" 'earnings_quarterly_growth',\n",
" 'price_to_sales_trailing_12_months',\n",
" 'date_short_interest',\n",
" 'peg_ratio',\n",
" 'ytd_return',\n",
" 'forward_pe',\n",
" 'max_age',\n",
" 'last_cap_gain',\n",
" 'shares_percent_of_float',\n",
" 'shares_short_prior_month',\n",
" 'category',\n",
" 'five_year_average_return',\n",
" 'data_source',\n",
" 'last_updated'\n",
"]\n",
"\n",
"\n",
"dic_cols_rename_ref = {\n",
" 'eod': dic_cols_rename_price,\n",
" 'intra': dic_cols_rename_price,\n",
" 'earnings': dic_cols_rename_earnings,\n",
" 'balance_sheet': dic_cols_rename_balance_sheet,\n",
" 'cashflow': dic_cols_rename_cashflow,\n",
" 'financials': dic_cols_rename_financials,\n",
" 'key_stats': dic_cols_rename_key_stats\n",
"}\n",
"\n",
"\n",
"dic_ls_cols_yfinance_ref = {\n",
" 'eod': ls_cols_yfinance_price,\n",
" 'intra': ls_cols_yfinance_price,\n",
" 'earnings': ls_cols_yfinance_earnings,\n",
" 'balance_sheet': ls_cols_yfinance_balance_sheet,\n",
" 'cashflow': ls_cols_yfinance_cashflow,\n",
" 'financials': ls_cols_yfinance_financials,\n",
"}\n",
"\n",
"\n",
"dic_ls_cols_yahoofinance_ref = {\n",
" 'key_stats': ls_cols_yahoofinance_key_stats\n",
"}\n",
"\n",
"\n",
"dic_ls_cols_ref = {\n",
" 'yfinance': dic_ls_cols_yfinance_ref,\n",
" 'yahoofinance': dic_ls_cols_yahoofinance_ref\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"if sys.platform == \"win32\":\n",
" base_dir = r\"\\\\dir\" # dir for win machines\n",
"else:\n",
" base_dir = \"/dir\" # dir for non-win machines\n",
"\n",
"\n",
"dic_args_set = {\n",
"\n",
" 'args1': {'func': 'US_price_data_pull',\n",
" 'region': 'US',\n",
" 'data_cat': 'data',\n",
" 'data_subcat': 'entity_data',\n",
" 'data_type': 'price_data',\n",
" 'data_source': 'yfinance',\n",
" 'index_filter': None,\n",
" 'data_dir': os.path.join('{region}', '{data_cat}', '{data_subcat}', '{composite_symbol}',\n",
" '{entity_symbol}', '{data_type}', '{data_source}', '{table_type}'),\n",
" 'tracker_dir': os.path.join('US', 'tracker', 'entity_tracker', 'price_data', 'yfinance')},\n",
"\n",
" 'args2': {'func': 'US_fund_data_pull',\n",
" 'region': 'US',\n",
" 'data_cat': 'data',\n",
" 'data_subcat': 'entity_data',\n",
" 'data_type': 'fund_data',\n",
" 'data_source': 'yfinance',\n",
" 'index_filter': None,\n",
" 'data_dir': os.path.join('{region}', '{data_cat}', '{data_subcat}', '{composite_symbol}',\n",
" '{entity_symbol}', '{data_type}', '{data_source}', '{table_type}'),\n",
" 'tracker_dir': os.path.join('US', 'tracker', 'entity_tracker', 'fund_data', 'yfinance')},\n",
"\n",
" 'args3': {'func': 'US_fund_data_pull',\n",
" 'region': 'US',\n",
" 'data_cat': 'data',\n",
" 'data_subcat': 'entity_data',\n",
" 'data_type': 'fund_data',\n",
" 'data_source': 'yahoofinance',\n",
" 'index_filter': None,\n",
" 'data_dir': os.path.join('{region}', '{data_cat}', '{data_subcat}', '{composite_symbol}',\n",
" '{entity_symbol}', '{data_type}', '{data_source}', '{table_type}'),\n",
" 'tracker_dir': os.path.join('US', 'tracker', 'entity_tracker', 'fund_data', 'yahoofinance')},\n",
"\n",
" }"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for dic_args in dic_args_set.values():\n",
" creation_date = date.today().strftime('%Y%m%d')\n",
" log_file = creation_date + \"_\" + dic_args['func'] + \".log\"\n",
"\n",
" logs_dir = os.path.join(base_dir, '_logs')\n",
"\n",
" dic_dir = {'base': base_dir,\n",
" 'logs': logs_dir}\n",
"\n",
" check_or_create_dir(dic_dir['logs'])\n",
"\n",
" log_path = os.path.join(dic_dir['logs'], log_file)\n",
"\n",
" logging.basicConfig(filename=log_path, level=logging.INFO)\n",
"\n",
" log_str = \"{}({}, {}, {}, {})\".format(dic_args['func'], dic_args['region'], dic_args['data_cat'],\n",
" dic_args['data_subcat'], dic_args['data_source'])\n",
" \n",
" try:\n",
" print(\"\\n\" + \"STARTING: \" + log_str)\n",
" logging.info(\"\\n\" + \"STARTING: \" + log_str)\n",
"\n",
" pull_data(dic_dir, dic_args) \n",
"\n",
" print(\"COMPLETED: \" + log_str)\n",
" logging.info(\"COMPLETED: \" + log_str)\n",
"\n",
" except Exception as e:\n",
" print(str(e))\n",
" logging.error(str(e))\n",
"\n",
" print(\"FAILED: \" + log_str)\n",
" logging.error(\"FAILED: \" + log_str)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment