Jay-Jay-D/cointegration_test.py

## cointegration_test.py
def cointegration_test(df_data, start_date=None, end_date=None, verbose=False):
    """Screen every pair of columns in ``df_data`` for cointegration.

    Following the paper "High Frequency and Dynamic Pairs Trading Based on
    Statistical Arbitrage Using a Two-Stage Correlation and Cointegration
    Approach", each pair goes through three successive filters and is dropped
    at the first one it fails:

    1. the Pearson correlation coefficient must be at least 0.9;
    2. the p-value returned by ``coint`` must be below 0.1;
    3. the residuals of the regression of the first stock on the second
       (``sm.OLS(X, Y)``; no constant term is added here) must pass an
       Augmented Dickey-Fuller test with a p-value below 0.1.

    Args:
        df_data (pandas.DataFrame):
            A DataFrame with the close prices for each stock/security. Its
            index must hold timestamps: the first/last entries are asked for
            ``.date()`` and the rows are sliced with "YYYY-MM" strings.
            - Example:
                index       AAPL      MSFT    ...   SBUX
                2014-01-09  79.18     35.53   ...   38.8
                2014-01-10  79.62     35.76   ...   39.02
                2014-01-11  78.06     36.41   ...   38.6
                2014-01-12  76.53     36.13   ...   38.08

        start_date (Optional[datetime.date]):
            The start date of the period to analyze. If None, the date of
            the first observation in df_data is used. Defaults to None.
        end_date (Optional[datetime.date]):
            The end date of the period to analyze. If None, the date of the
            last observation in df_data is used. Defaults to None.
        verbose (bool):
            If True, print the pairs being tested and the test results.
            Defaults to False.

    Note:
        The rows are selected with "%Y-%m" labels, so the analyzed window
        always spans whole calendar months; the day component of
        start_date/end_date only shows up in the verbose summary line.

    Returns:
        Optional[pandas.DataFrame]:
            None when no pair passes all three filters. Otherwise a
            DataFrame with the cointegrated pairs as index and as columns:
                - The test_value of the ADF Test
                - The gamma coefficient of the regression.
                - The p_value of the ADF Test
                - The pearson coefficient of the correlation
                - The sd residuals of the regression.
            - Example:
                             ADF_test_value     gamma   p_value   pearson  sd_residuals
            Stock_A Stock_B
            ESV     NE            -3.336918  1.630184  0.013307  0.976602      0.764803
                    CNQ           -3.464779  1.601749  0.008939  0.958392      0.971339
            RDC     CNQ           -2.833936  1.016649  0.053587  0.931975      0.792285
            SGY     VLO           -4.638204  1.215127  0.000110  0.947521      1.055146
    """
    dt_start_date = start_date if start_date is not None else df_data.index[0].date()
    dt_end_date = end_date if end_date is not None else df_data.index[-1].date()
    st_start_date = dt_start_date.strftime("%Y-%m")
    st_end_date = dt_end_date.strftime("%Y-%m")

    # Label-based slice (``.ix`` no longer exists in current pandas); taken
    # once here because the window is the same for every pair.
    window = df_data.loc[st_start_date:st_end_date]

    cointegrated_pairs = []
    pearson_coeff = []
    p_value = []
    test_value = []
    gamma_coeff = []
    res_sd = []

    # All messages use single-argument print(...) so that the emitted text is
    # the same whether print is a statement (Python 2) or a function.
    for stock_pair in combinations(window.columns.tolist(), 2):
        X = window[stock_pair[0]]
        Y = window[stock_pair[1]]
        if verbose:
            print("Testing pair:  {0}".format(stock_pair))

        corr_coeff = pearsonr(X, Y)[0]
        # Written as "not >=" rather than "<" so a NaN coefficient is rejected.
        if not (corr_coeff >= 0.9):
            continue
        if verbose:
            print("\t=> Pair Correlated")

        if not (coint(X, Y)[1] < 0.1):
            continue
        if verbose:
            print("\t=> Pair Cointegrated")

        reg = sm.OLS(X, Y).fit()
        res = reg.resid
        ADF_test = adfuller(res)
        if not (ADF_test[1] < 0.1):
            continue
        if verbose:
            print("\t=> ADF test passed")

        cointegrated_pairs.append(stock_pair)
        p_value.append(ADF_test[1])
        test_value.append(ADF_test[0])
        # params is labelled by the regressor's name, so take the single
        # coefficient by position instead of looking up the label 0.
        gamma_coeff.append(reg.params.iloc[0])
        pearson_coeff.append(corr_coeff)
        res_sd.append(res.std())

    if verbose:
        print("From {0} to {1}, {2} cointegrated pairs were found\n".format(
            dt_start_date.strftime("%Y-%m-%d"),
            dt_end_date.strftime("%Y-%m-%d"),
            len(cointegrated_pairs)))

    if not cointegrated_pairs:
        return None

    index = pd.MultiIndex.from_tuples(cointegrated_pairs, names=['Stock_A', 'Stock_B'])
    # Explicit column order so the layout does not depend on dict ordering.
    df = pd.DataFrame(
        data={'pearson': pearson_coeff, 'ADF_test_value': test_value, 'p_value': p_value,
              'gamma': gamma_coeff, 'sd_residuals': res_sd},
        index=index,
        columns=['ADF_test_value', 'gamma', 'p_value', 'pearson', 'sd_residuals'])
    if verbose:
        print("{0} \n".format(df))
    return df
	def cointegration_test(df_data, start_date=None, end_date=None, verbose=False):
	    """Screen every pair of columns in ``df_data`` for cointegration.

	    Following the paper "High Frequency and Dynamic Pairs Trading Based on
	    Statistical Arbitrage Using a Two-Stage Correlation and Cointegration
	    Approach", each pair goes through three successive filters and is dropped
	    at the first one it fails:

	    1. the Pearson correlation coefficient must be at least 0.9;
	    2. the p-value returned by ``coint`` must be below 0.1;
	    3. the residuals of the regression of the first stock on the second
	       (``sm.OLS(X, Y)``; no constant term is added here) must pass an
	       Augmented Dickey-Fuller test with a p-value below 0.1.

	    Args:
	        df_data (pandas.DataFrame):
	            A DataFrame with the close prices for each stock/security. Its
	            index must hold timestamps: the first/last entries are asked for
	            ``.date()`` and the rows are sliced with "YYYY-MM" strings.
	            - Example:
	                index       AAPL      MSFT    ...   SBUX
	                2014-01-09  79.18     35.53   ...   38.8
	                2014-01-10  79.62     35.76   ...   39.02
	                2014-01-11  78.06     36.41   ...   38.6
	                2014-01-12  76.53     36.13   ...   38.08

	        start_date (Optional[datetime.date]):
	            The start date of the period to analyze. If None, the date of
	            the first observation in df_data is used. Defaults to None.
	        end_date (Optional[datetime.date]):
	            The end date of the period to analyze. If None, the date of the
	            last observation in df_data is used. Defaults to None.
	        verbose (bool):
	            If True, print the pairs being tested and the test results.
	            Defaults to False.

	    Note:
	        The rows are selected with "%Y-%m" labels, so the analyzed window
	        always spans whole calendar months; the day component of
	        start_date/end_date only shows up in the verbose summary line.

	    Returns:
	        Optional[pandas.DataFrame]:
	            None when no pair passes all three filters. Otherwise a
	            DataFrame with the cointegrated pairs as index and as columns:
	                - The test_value of the ADF Test
	                - The gamma coefficient of the regression.
	                - The p_value of the ADF Test
	                - The pearson coefficient of the correlation
	                - The sd residuals of the regression.
	            - Example:
	                             ADF_test_value     gamma   p_value   pearson  sd_residuals
	            Stock_A Stock_B
	            ESV     NE            -3.336918  1.630184  0.013307  0.976602      0.764803
	                    CNQ           -3.464779  1.601749  0.008939  0.958392      0.971339
	            RDC     CNQ           -2.833936  1.016649  0.053587  0.931975      0.792285
	            SGY     VLO           -4.638204  1.215127  0.000110  0.947521      1.055146
	    """
	    dt_start_date = start_date if start_date is not None else df_data.index[0].date()
	    dt_end_date = end_date if end_date is not None else df_data.index[-1].date()
	    st_start_date = dt_start_date.strftime("%Y-%m")
	    st_end_date = dt_end_date.strftime("%Y-%m")

	    # Label-based slice (``.ix`` no longer exists in current pandas); taken
	    # once here because the window is the same for every pair.
	    window = df_data.loc[st_start_date:st_end_date]

	    cointegrated_pairs = []
	    pearson_coeff = []
	    p_value = []
	    test_value = []
	    gamma_coeff = []
	    res_sd = []

	    # All messages use single-argument print(...) so that the emitted text is
	    # the same whether print is a statement (Python 2) or a function.
	    for stock_pair in combinations(window.columns.tolist(), 2):
	        X = window[stock_pair[0]]
	        Y = window[stock_pair[1]]
	        if verbose:
	            print("Testing pair:  {0}".format(stock_pair))

	        corr_coeff = pearsonr(X, Y)[0]
	        # Written as "not >=" rather than "<" so a NaN coefficient is rejected.
	        if not (corr_coeff >= 0.9):
	            continue
	        if verbose:
	            print("\t=> Pair Correlated")

	        if not (coint(X, Y)[1] < 0.1):
	            continue
	        if verbose:
	            print("\t=> Pair Cointegrated")

	        reg = sm.OLS(X, Y).fit()
	        res = reg.resid
	        ADF_test = adfuller(res)
	        if not (ADF_test[1] < 0.1):
	            continue
	        if verbose:
	            print("\t=> ADF test passed")

	        cointegrated_pairs.append(stock_pair)
	        p_value.append(ADF_test[1])
	        test_value.append(ADF_test[0])
	        # params is labelled by the regressor's name, so take the single
	        # coefficient by position instead of looking up the label 0.
	        gamma_coeff.append(reg.params.iloc[0])
	        pearson_coeff.append(corr_coeff)
	        res_sd.append(res.std())

	    if verbose:
	        print("From {0} to {1}, {2} cointegrated pairs were found\n".format(
	            dt_start_date.strftime("%Y-%m-%d"),
	            dt_end_date.strftime("%Y-%m-%d"),
	            len(cointegrated_pairs)))

	    if not cointegrated_pairs:
	        return None

	    index = pd.MultiIndex.from_tuples(cointegrated_pairs, names=['Stock_A', 'Stock_B'])
	    # Explicit column order so the layout does not depend on dict ordering.
	    df = pd.DataFrame(
	        data={'pearson': pearson_coeff, 'ADF_test_value': test_value, 'p_value': p_value,
	              'gamma': gamma_coeff, 'sd_residuals': res_sd},
	        index=index,
	        columns=['ADF_test_value', 'gamma', 'p_value', 'pearson', 'sd_residuals'])
	    if verbose:
	        print("{0} \n".format(df))
	    return df