"cells": [
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# system/os/regex and basic math functions\n",
"import os\n",
"import re\n",
"import sys\n",
"import math\n",
"import json\n",
"import time\n",
"import string\n",
"import dateutil\n",
"import datetime as dt\n",
"from itertools import chain"
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Set logging level\n",
"import logging\n",
"\n",
"# LOG_LEVEL and LOG_FORMAT are optional names the user may define before running this cell;\n",
"# a NameError simply means the corresponding default should be used.\n",
"try:\n",
"    kwargs = {'level': getattr(logging, LOG_LEVEL)}\n",
"except NameError:\n",
"    kwargs = {'level': logging.WARNING}\n",
"    print('Set LOG_LEVEL=\"INFO\" before running the import file to get moar output.')\n",
"try:\n",
"    kwargs['format'] = LOG_FORMAT\n",
"except NameError:\n",
"    kwargs['format'] = \"%(levelname)s::%(message)s\"\n",
"    print('Set LOG_FORMAT to change log format.')\n",
"\n",
"# Apply the collected level/format; without this call the settings above are thrown away.\n",
"logging.basicConfig(**kwargs)\n",
"logger = logging.getLogger('notebook')\n",
"del kwargs\n",
"\n",
"import warnings\n",
"warnings.simplefilter(action='ignore', category=FutureWarning)"
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# IPython display convenience stuff\n",
"try:\n",
"    from IPython.display import HTML, display, display_html, display_javascript\n",
"    from IPython import __version__ as ipythonversion\n",
"    import ipywidgets\n",
"    print(\"IPython: {}\".format(ipythonversion))\n",
"except ImportError:\n",
"    # Optional dependency: when it is missing the display helpers are left undefined.\n",
"    pass"
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"try:\n",
"    # numpy for matrix algebra\n",
"    import numpy as np\n",
"    os.environ['NUMEXPR_MAX_THREADS'] = '20'\n",
"    print(\"Numpy (np): {}\".format(np.__version__))\n",
"except ImportError:\n",
"    # Optional dependency: skip silently when numpy is not installed.\n",
"    pass"
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"try:\n",
"    # scipy for probability distributions and some statistical tests\n",
"    import scipy as sp\n",
"    import scipy.stats as stats\n",
"    print(\"Scipy (sp, stats): {}\".format(sp.__version__))\n",
"except ImportError:\n",
"    # Optional dependency: skip silently when scipy is not installed.\n",
"    pass"
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"try:\n",
"    # pandas for data manipulation\n",
"    import pandas as pd\n",
"    print(\"Pandas (pd): {}\".format(pd.__version__))\n",
"\n",
"    def fmt_float(float_in, rstrip0s=re.compile(r'\\.0+$')):\n",
"        \"\"\"Format a float for DataFrame display.\n",
"\n",
"        Uses thousands separators, three decimals when the magnitude is below\n",
"        1,000 and none otherwise, and drops a decimal part that is all zeros.\n",
"        Falls back to ``str(float_in)`` if formatting fails.\n",
"        \"\"\"\n",
"        try:\n",
"            return rstrip0s.sub('', '{0:,.{1}f}'.format(float_in, 3 - 3 * bool(abs(float_in) // 1000)))\n",
"        except Exception:\n",
"            return str(float_in)\n",
"\n",
"    pd.set_option('display.float_format', fmt_float)\n",
"    pd.set_option('display.max_rows', 250)\n",
"    pd.set_option('display.max_columns', 250)\n",
"    pd.set_option('display.notebook_repr_html', True)\n",
"\n",
"    def latex_format(num_in):\n",
"        \"\"\"Format numbers for Latex tables.\n",
"\n",
"        Zero is rendered as \"0\"; magnitudes of 100 or more as comma-grouped\n",
"        (truncated) integers; magnitudes in [1, 100) with one decimal; smaller\n",
"        values with three decimals. Input that cannot be handled as a finite\n",
"        number is returned through ``str``.\n",
"        \"\"\"\n",
"        try:\n",
"            num_in = float(num_in)\n",
"            # Check zero first: log10(0) is undefined.\n",
"            if num_in == 0:\n",
"                return \"0\"\n",
"            num_dig = math.log10(abs(num_in)) + 1\n",
"            if num_dig >= 3:\n",
"                return f\"{int(num_in):,d}\"\n",
"            elif num_dig >= 1:\n",
"                return f\"{num_in:2.1f}\"\n",
"            return f\"{num_in:1.3f}\"\n",
"        except (TypeError, ValueError, OverflowError):\n",
"            # TypeError: non-numeric objects such as None; OverflowError: int(inf).\n",
"            return str(num_in)\n",
"\n",
"    def S(df, cols=None, keep_dups=False):\n",
"        \"\"\"S splits strings, and if called with a df input, interpolates variable names.\n",
"\n",
"        Names containing ``*`` or ``?`` are turned into a case-insensitive regex\n",
"        and expanded against ``df.columns``; other names are kept literally.\n",
"        Duplicates are removed (first occurrence wins) unless ``keep_dups``.\n",
"\n",
"        Example::\n",
"            S('gvkey datadate') # --> ['gvkey', 'datadate']\n",
"            df.S('gvk* datad* num*') # --> ['gvkey', 'datadate', 'num_words', 'num_sentences']\n",
"        \"\"\"\n",
"        if isinstance(df, str):\n",
"            cols = df\n",
"        if isinstance(cols, str):\n",
"            new_cols = []\n",
"            for col in cols.split():\n",
"                if '*' in col or '?' in col:\n",
"                    matcher = re.compile(r'\\b'+col.replace('*', '.*').replace('?', '.')+r'\\b', re.I)\n",
"                    # NOTE(review): the match call was missing in the original line; search() is assumed.\n",
"                    # str(c) keeps non-string column labels from raising.\n",
"                    new_cols.extend(c for c in df.columns if matcher.search(str(c)))\n",
"                else:\n",
"                    new_cols.append(col)\n",
"            cols = new_cols\n",
"        return cols if keep_dups else list(dict.fromkeys(cols))\n",
"\n",
"    # monkeypatch S into DataFrame\n",
"    pd.DataFrame.S = S\n",
"\n",
"    def hugetable(df, soft_max=5000, hard_max=100_000):\n",
"        \"\"\"Display ``df`` with the row/column display limits temporarily raised.\n",
"\n",
"        The limit used is ``min(soft_max, hard_max)``; the previous option\n",
"        values are restored afterwards, even if displaying raises.\n",
"        \"\"\"\n",
"        limit = min(soft_max, hard_max)\n",
"        with pd.option_context('display.max_rows', limit, 'display.max_columns', limit):\n",
"            display_html(df)\n",
"except ImportError:\n",
"    # Optional dependency: without pandas none of the helpers above are defined.\n",
"    pass"
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"try:\n",
"    # matplotlib for plotting and pyplot for MATLAB-style API\n",
"    import matplotlib as mpl\n",
"    import matplotlib.pyplot as plt\n",
"    plt.rcParams['figure.figsize'] = (15, 5)\n",
"    print(\"MatPlotLib (mpl, plt): {}\".format(mpl.__version__))\n",
"except ImportError:\n",
"    # Optional dependency: skip silently when matplotlib is not installed.\n",
"    pass"
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"try:\n",
"    # Seaborn for pretty plotting\n",
"    import seaborn as sns\n",
"    print(\"Seaborn (sns): {}\".format(sns.__version__))\n",
"except ImportError:\n",
"    # Optional dependency: skip silently when seaborn is not installed.\n",
"    pass"
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"try:\n",
"    # Scikit Learn for more regressions\n",
"    import sklearn as sk\n",
"    print(\"Scikit-Learn (sk): {}\".format(sk.__version__))\n",
"except ImportError:\n",
"    # Optional dependency: skip silently when scikit-learn is not installed.\n",
"    pass"
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"try:\n",
"    # statsmodels for econometrics\n",
"    import statsmodels.api as sm\n",
"    import statsmodels.formula.api as smf\n",
"    print(\"Statsmodels (sm,smf): {}\".format(sm.__version__))\n",
"except (ImportError, AttributeError):\n",
"    # AttributeError covers a statsmodels.api module that exposes no __version__.\n",
"    pass"
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"try:\n",
"    # patsy for making formulas\n",
"    import patsy as pt\n",
"    print(\"Patsy (pt): {}\".format(pt.__version__))\n",
"except ImportError:\n",
"    # Optional dependency: skip silently when patsy is not installed.\n",
"    pass"
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"try:\n",
"    # SQLAlchemy for relational db management\n",
"    import sqlalchemy as sa\n",
"    print(\"SQLAlchemy (sa): {}\".format(sa.__version__))\n",
"except ImportError:\n",
"    # Optional dependency: skip silently when SQLAlchemy is not installed.\n",
"    pass"
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"try:\n",
"    # Gensim for textual analysis\n",
"    import gensim\n",
"    print(\"Gensim: {}\".format(gensim.__version__))\n",
"except ImportError:\n",
"    # Optional dependency: skip silently when gensim is not installed.\n",
"    pass"
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"try:\n",
"    # TQDM for progress bar outputs\n",
"    from tqdm.notebook import tqdm\n",
"except ImportError:\n",
"    def tqdm(thing, *args, **kwargs):\n",
"        \"\"\"Stand-in used when tqdm is not installed: returns ``thing`` unchanged and ignores all options.\"\"\"\n",
"        return thing"
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"try:\n",
"    # sas7bdat for reading SAS created databases\n",
"    from sas7bdat import SAS7BDAT as SASdb\n",
"    print(\"SAS7BDAT (SASdb): unknown version\")\n",
"    # Reference date that SAS day counts are added to below.\n",
"    SAS_ZERO = dt.datetime(1960,1,1)\n",
"\n",
"    def sas_date_to_datetime(df_col):\n",
"        \"\"\"Interpret ``df_col`` as day offsets and return ``SAS_ZERO`` plus those offsets.\"\"\"\n",
"        return pd.to_timedelta(df_col, unit='d') + SAS_ZERO\n",
"except ImportError:\n",
"    # Optional dependency: without sas7bdat neither SAS_ZERO nor the helper is defined.\n",
"    pass"
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"try:\n",
"    # BeautifulSoup for HTML things\n",
"    from bs4 import BeautifulSoup\n",
"except ImportError:\n",
"    # Optional dependency: skip silently when bs4 is not installed.\n",
"    pass"
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"try:\n",
"    from pyedgar.utilities import edgarweb\n",
"except ImportError:\n",
"    class _o_(object):\n",
"        \"\"\"Minimal stand-in for ``edgarweb`` when pyedgar is not installed.\"\"\"\n",
"\n",
"        @staticmethod\n",
"        def edgar_links(*args, **kwargs):\n",
"            \"\"\"Accept any arguments and return an empty string in place of link markup.\"\"\"\n",
"            return ''\n",
"    edgarweb = _o_()"
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Some nice date constants\n",
"MIN_DATE = dt.datetime(1900, 1, 1)\n",
"# NOTE(review): the original right-hand side was missing; 2100-01-01 is a placeholder upper bound -- confirm.\n",
"MAX_DATE = dt.datetime(2100, 1, 1)\n",
"TD_DAY = pd.Timedelta(days=1)\n",
"# A fixed 365-day span (leap days are not accounted for).\n",
"TD_YEAR = TD_DAY * 365"
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# print(\"linkhead(df, n=5, title='', fields=None, cik='cik', accession='accession')\")\n",
"def linkhead(df, n=5, title='', fields=None, cik='cik', accession='accession', return_df=False):\n",
"    \"\"\"\n",
"    Displays top rows of a dataframe, and includes\n",
"    links to the HTML and FTP websites if CIK and Accession are found.\n",
"\n",
"    Parameters:\n",
"        df: DataFrame to preview.\n",
"        n: number of leading rows to show.\n",
"        title: optional heading rendered above the table.\n",
"        fields: columns to show (default: all columns). Never modified.\n",
"        cik, accession: names of the columns passed to ``edgarweb.edgar_links``;\n",
"            the link column is only added when both are present.\n",
"        return_df: when True, return the (copied) rows that were displayed.\n",
"\n",
"    Returns:\n",
"        The displayed rows if ``return_df`` is True, otherwise None.\n",
"    \"\"\"\n",
"    if len(df) == 0:\n",
"        display_html(df[list(fields or df.columns)].assign(link='').to_html(), raw=True)\n",
"        return df.head(0).copy() if return_df else None\n",
"\n",
"    # Copy so that appending the link column below never touches the caller's list.\n",
"    fields = list(df.columns) if fields is None else list(fields)\n",
"\n",
"    dfn = df.head(n).copy()\n",
"\n",
"    if cik in dfn.columns and accession in dfn.columns:\n",
"        # Pick a column name for the links that does not collide with existing columns.\n",
"        linkstr, i = 'links', 0\n",
"        while linkstr in dfn.columns:\n",
"            linkstr = 'links%d' % i\n",
"            i += 1\n",
"        dfn[linkstr] = dfn.apply(lambda row: edgarweb.edgar_links(row[cik], row[accession]), axis=1)\n",
"        fields.append(linkstr)\n",
"\n",
"    html = f\"<h4>{title}</h4>\" if title else ''\n",
"    # Lift the column-width limit only while rendering; the previous value is restored\n",
"    # on exit even if rendering raises.\n",
"    with pd.option_context('display.max_colwidth', None):\n",
"        html += dfn[fields].to_html(escape=False, index=False, na_rep=\"\")\n",
"        display_html(html, raw=True)\n",
"\n",
"    if return_df:\n",
"        return dfn"
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# print(\"timehist(dtseries_or_df, time_variable='year', y_tic_number=4, x_tic_skip=0, *args, **kwargs)\")\n",
"def timehist(dtseries_or_df, time_variable='year',\n",
"             y_tic_number=4, x_tic_skip=0,\n",
"             width=.9, ax=None, skip_retick=None,\n",
"             label=None,\n",
"             *args, **kwargs):\n",
"    \"\"\"\n",
"    Histogram of observations per time period.\n",
"    First tries: dtseries_or_df.dt.time_variable\n",
"    Failing that, tries: dtseries_or_df[time_variable]\n",
"    Failing that, does dtseries_or_df.value_counts()\n",
"\n",
"    When ``ax`` is None a new figure is created from args and kwargs\n",
"    (default figsize (13, 2)). When ``ax`` is given, kwargs are forwarded\n",
"    to the bar call and styling/ticks are left alone unless ``skip_retick``\n",
"    is passed explicitly as a falsy value.\n",
"\n",
"    Returns the axes drawn on, or None when there are no observations or\n",
"    more than 1000 distinct levels.\n",
"    \"\"\"\n",
"    if skip_retick is None:\n",
"        # Drawing onto an existing axis: leave its styling alone by default.\n",
"        skip_retick = ax is not None\n",
"    skip_retick = bool(skip_retick)\n",
"\n",
"    # x_tic_skip counts the ticks to skip, so the slice step is one more.\n",
"    x_tic_skip += 1\n",
"\n",
"    if not skip_retick:\n",
"        sns.set_style('darkgrid')\n",
"        sns.set_context('talk', rc={'patch.linewidth': 0, 'patch.edgecolor': 'k', 'patch.facecolor': 'k'})\n",
"\n",
"    _d = dtseries_or_df\n",
"    try:\n",
"        _d = getattr(_d.dt, time_variable)\n",
"    except Exception:\n",
"        try:\n",
"            _d = _d[time_variable]\n",
"        except Exception:\n",
"            # Best effort: fall back to counting the raw values.\n",
"            pass\n",
"    _g = _d.value_counts().sort_index()\n",
"    if len(_g) > 1000:\n",
"        logger.error(\"ERROR: You are trying to plot something with too many levels. Don't do that.\")\n",
"        return\n",
"    if len(_g) == 0:\n",
"        # The axis-limit and tick arithmetic below is undefined without data.\n",
"        logger.warning(\"timehist: no observations to plot.\")\n",
"        return\n",
"\n",
"    if ax is None:\n",
"        if 'figsize' not in kwargs:\n",
"            kwargs['figsize'] = (13,2)\n",
"        plt.figure(*args, **kwargs)\n",
"        ax = plt.gca()\n",
"        # If ax is none, assume kwargs are for figure generation.\n",
"        kwargs = {}\n",
"\n",
"    # NOTE(review): the start of this call was missing in the original; reconstructed as a bar\n",
"    # chart of counts against the sorted index.\n",
"    ax.bar(_g.index, _g, width=width, label=label, **kwargs)\n",
"\n",
"    if not skip_retick:\n",
"        # Format and label X axis\n",
"        ax.set_xlim(left=_g.index.min()-0.5, right=_g.index.max()+0.5)\n",
"        _t = _g.index[::x_tic_skip]\n",
"        ax.set_xticks(_t)\n",
"        ax.set_xticklabels([str(t) for t in _t], rotation='vertical')\n",
"        # Label Y Axis: tick values are multiples of a power of ten one order below the peak count.\n",
"        tene = math.floor(math.log10(_g.max())) - 1\n",
"        topnum = math.ceil(_g.max() / 10**tene)\n",
"        ax.set_yticks([(topnum * i // y_tic_number)*10**tene for i in range(y_tic_number, 0, -1)])\n",
"\n",
"    return ax"
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def savefig(file_name, fig, *args, default_root='.', make_name_unique=False, **kwargs):\n",
"    \"\"\"\n",
"    Save figure to ``default_root``/``file_name``.\n",
"\n",
"    If no extension is added, PNG is assumed (at default 300 DPI).\n",
"\n",
"    If make_name_unique flag is True, Y-M-D_H-M-S is added to filename.\n",
"\n",
"    Extra positional and keyword arguments are passed to ``fig.savefig``;\n",
"    keyword arguments override the defaults (tight bounding box, 0.1 inch\n",
"    padding, transparent background).\n",
"\n",
"    Returns file path that was created.\n",
"    \"\"\"\n",
"    _fname, _ext = os.path.splitext(file_name)\n",
"    if not _ext:\n",
"        _ext = '.png'\n",
"\n",
"    if make_name_unique:\n",
"        # Timestamp goes between the stem and the extension.\n",
"        _fname = f\"{_fname}_{dt.datetime.now():%Y-%m-%d_%H-%M-%S}\"\n",
"\n",
"    file_path = os.path.join(default_root, _fname + _ext)\n",
"\n",
"    default_kwargs = {\n",
"        'bbox_inches': 'tight',\n",
"        'pad_inches': 0.1,\n",
"        'transparent': True\n",
"    }\n",
"\n",
"    kwargs = {**default_kwargs, **kwargs}\n",
"\n",
"    # Only PNG gets a default resolution; an explicit dpi always wins.\n",
"    if _ext.lower() == '.png':\n",
"        kwargs.setdefault('dpi', 300)\n",
"\n",
"    fig.savefig(file_path, *args, **kwargs)\n",
"\n",
"    return file_path"
