Anthony Wynne ant358

## anova_machine.py
def anova_machine(Cat_col, target_col, df, p_thres=0.05):
    """Run a one-way ANOVA of ``target_col`` across the categories of ``Cat_col``.

    The data are pivoted so that every category of ``Cat_col`` becomes one
    column holding that category's ``target_col`` values.  A one-way ANOVA
    (``scipy.stats.f_oneway``) then tests whether the column means differ.

    Args:
        Cat_col: Name of the categorical column that defines the groups.
        target_col: Name of the column holding the target variable y.
        df: Main data set containing both columns.  ``DataFrame.pivot``
            raises ``ValueError`` when two rows share both an index label
            and a category.
        p_thres: Significance threshold.  The default 0.05 corresponds to
            the 95% confidence level.

    Returns:
        A tuple ``(f_statistic, p_value, significant)`` where
        ``significant`` is True when ``p_value < p_thres``.  When fewer than
        two categories contain data no test is possible and
        ``(nan, nan, False)`` is returned.
    """
    from scipy.stats import f_oneway

    p_table = df.pivot(columns=Cat_col, values=target_col)

    # Every pivot column is NaN outside its own category's rows, so the
    # padding (and any genuinely missing target value) is dropped here.
    groups = [p_table[category].dropna() for category in p_table.columns]
    groups = [group for group in groups if len(group) > 0]

    if len(groups) < 2:
        return float("nan"), float("nan"), False

    f_statistic, p_value = f_oneway(*groups)
    return float(f_statistic), float(p_value), bool(p_value < p_thres)

## sig_num_columns.py
def sig_num_columns(X_train, y_train, p_thres=0.05):
    """Find the numerical features that are significantly correlated with the target.

    Each column of ``X_train`` is regressed against ``y_train`` with
    ``scipy.stats.linregress``; a column is kept when the p-value of that
    regression is below ``p_thres``.

    Args:
        X_train: Training features.  Pass numerical columns only; other
            column types make ``linregress`` fail.
        y_train: Target values, one per row of ``X_train``.
        p_thres: Significance threshold.  The default 0.05 corresponds to
            the 95% confidence level.

    Returns:
        A DataFrame with one row per significant column and the columns
        ``column`` (feature name) and ``p_value``.

    Side effects:
        The module-level name ``sig_num`` is rebound to a
        ``{column: p_value}`` dict of the same significant columns.
    """
    import pandas as pd
    from scipy.stats import linregress

    global sig_num
    sig_num = {}
    for col in X_train:
        # linregress reports the p-value for the null hypothesis "slope == 0"
        pvalue = linregress(X_train[col], y_train).pvalue
        if pvalue < p_thres:
            sig_num[col] = pvalue

    return pd.DataFrame(list(sig_num.items()), columns=["column", "p_value"])

## replace_all_NaN.py
def replace_all_NaN(df):
    """Fill the missing values of ``df`` in place, column by column.

    Use this only when you are confident that missing numbers can become 0
    and missing objects can become a ``No_<column name>`` placeholder.

    * object columns: NaN -> ``'No_'`` followed by the column name
    * float columns (any width): NaN -> ``0.0``

    Columns of other dtypes, and columns without missing values, are left
    untouched.  ``df.info()`` is printed at the end so the caller can check
    that every gap was captured.

    Args:
        df: DataFrame to clean; it is modified in place.

    Returns:
        None.
    """
    from pandas.api.types import is_float_dtype, is_object_dtype

    for col in df:
        column = df[col]
        if not column.isna().any():
            continue
        if is_object_dtype(column):
            # str() keeps the placeholder working for non-string labels
            df[col] = column.fillna('No_' + str(col))
        elif is_float_dtype(column):
            df[col] = column.fillna(0.0)

    df.info()

## first_n_pairs.py
def first_n_pairs(dict_to_see, n):
    """Print a dictionary made of the first ``n`` key/value pairs of ``dict_to_see``.

    Handy for peeking at what the data in a large dictionary looks like.
    ``n`` is applied as a slice bound on the list of keys, so a value larger
    than the dictionary prints everything.  Nothing is returned.
    """
    leading_keys = list(dict_to_see)[:n]
    preview = dict((key, dict_to_see[key]) for key in leading_keys)
    print(preview)

## tweet_listener.py
class MyStreamListener(tweepy.StreamListener):
    """Stream listener that writes every received status to ``tweets.txt``.

    Each status is stored as one JSON document per line and counted in
    ``num_tweets``.  The output file stays open for the lifetime of the
    listener; call ``close()`` once streaming is finished.
    """

    def __init__(self, api=None):
        """Create the listener and open ``tweets.txt`` for writing.

        Args:
            api: Optional API object, forwarded to ``tweepy.StreamListener``.
        """
        # Forward ``api`` so a caller-supplied client is actually used
        super(MyStreamListener, self).__init__(api)
        self.num_tweets = 0
        # Mode "w" truncates any file left over from an earlier run
        self.file = open("tweets.txt", "w")

    def on_status(self, status):
        """Write the raw JSON of ``status`` as one line and count it."""
        tweet = status._json
        self.file.write(json.dumps(tweet) + '\n')
        self.num_tweets += 1

    def close(self):
        """Close ``tweets.txt``; safe to call more than once."""
        if not self.file.closed:
            self.file.close()
	def anova_machine(Cat_col, target_col, df, p_thres=0.05):
	    """Run a one-way ANOVA of ``target_col`` across the categories of ``Cat_col``.

	    The data are pivoted so that every category of ``Cat_col`` becomes one
	    column holding that category's ``target_col`` values.  A one-way ANOVA
	    (``scipy.stats.f_oneway``) then tests whether the column means differ.

	    Args:
	        Cat_col: Name of the categorical column that defines the groups.
	        target_col: Name of the column holding the target variable y.
	        df: Main data set containing both columns.  ``DataFrame.pivot``
	            raises ``ValueError`` when two rows share both an index label
	            and a category.
	        p_thres: Significance threshold.  The default 0.05 corresponds to
	            the 95% confidence level.

	    Returns:
	        A tuple ``(f_statistic, p_value, significant)`` where
	        ``significant`` is True when ``p_value < p_thres``.  When fewer than
	        two categories contain data no test is possible and
	        ``(nan, nan, False)`` is returned.
	    """
	    from scipy.stats import f_oneway

	    p_table = df.pivot(columns=Cat_col, values=target_col)

	    # Every pivot column is NaN outside its own category's rows, so the
	    # padding (and any genuinely missing target value) is dropped here.
	    groups = [p_table[category].dropna() for category in p_table.columns]
	    groups = [group for group in groups if len(group) > 0]

	    if len(groups) < 2:
	        return float("nan"), float("nan"), False

	    f_statistic, p_value = f_oneway(*groups)
	    return float(f_statistic), float(p_value), bool(p_value < p_thres)
	def sig_num_columns(X_train, y_train, p_thres=0.05):
	    """Find the numerical features that are significantly correlated with the target.

	    Each column of ``X_train`` is regressed against ``y_train`` with
	    ``scipy.stats.linregress``; a column is kept when the p-value of that
	    regression is below ``p_thres``.

	    Args:
	        X_train: Training features.  Pass numerical columns only; other
	            column types make ``linregress`` fail.
	        y_train: Target values, one per row of ``X_train``.
	        p_thres: Significance threshold.  The default 0.05 corresponds to
	            the 95% confidence level.

	    Returns:
	        A DataFrame with one row per significant column and the columns
	        ``column`` (feature name) and ``p_value``.

	    Side effects:
	        The module-level name ``sig_num`` is rebound to a
	        ``{column: p_value}`` dict of the same significant columns.
	    """
	    import pandas as pd
	    from scipy.stats import linregress

	    global sig_num
	    sig_num = {}
	    for col in X_train:
	        # linregress reports the p-value for the null hypothesis "slope == 0"
	        pvalue = linregress(X_train[col], y_train).pvalue
	        if pvalue < p_thres:
	            sig_num[col] = pvalue

	    return pd.DataFrame(list(sig_num.items()), columns=["column", "p_value"])
	def replace_all_NaN(df):
	    """Fill the missing values of ``df`` in place, column by column.

	    Use this only when you are confident that missing numbers can become 0
	    and missing objects can become a ``No_<column name>`` placeholder.

	    * object columns: NaN -> ``'No_'`` followed by the column name
	    * float columns (any width): NaN -> ``0.0``

	    Columns of other dtypes, and columns without missing values, are left
	    untouched.  ``df.info()`` is printed at the end so the caller can check
	    that every gap was captured.

	    Args:
	        df: DataFrame to clean; it is modified in place.

	    Returns:
	        None.
	    """
	    from pandas.api.types import is_float_dtype, is_object_dtype

	    for col in df:
	        column = df[col]
	        if not column.isna().any():
	            continue
	        if is_object_dtype(column):
	            # str() keeps the placeholder working for non-string labels
	            df[col] = column.fillna('No_' + str(col))
	        elif is_float_dtype(column):
	            df[col] = column.fillna(0.0)

	    df.info()
	def first_n_pairs(dict_to_see, n):
	    """Print a dictionary made of the first ``n`` key/value pairs of ``dict_to_see``.

	    Handy for peeking at what the data in a large dictionary looks like.
	    ``n`` is applied as a slice bound on the list of keys, so a value larger
	    than the dictionary prints everything.  Nothing is returned.
	    """
	    a = {k: dict_to_see[k] for k in list(dict_to_see)[:n]}
	    print(a)
	class MyStreamListener(tweepy.StreamListener):
	    """Stream listener that writes every received status to ``tweets.txt``.

	    Each status is stored as one JSON document per line and counted in
	    ``num_tweets``.  The output file stays open for the lifetime of the
	    listener; call ``close()`` once streaming is finished.
	    """

	    def __init__(self, api=None):
	        """Create the listener and open ``tweets.txt`` for writing.

	        Args:
	            api: Optional API object, forwarded to ``tweepy.StreamListener``.
	        """
	        # Forward ``api`` so a caller-supplied client is actually used
	        super(MyStreamListener, self).__init__(api)
	        self.num_tweets = 0
	        # Mode "w" truncates any file left over from an earlier run
	        self.file = open("tweets.txt", "w")

	    def on_status(self, status):
	        """Write the raw JSON of ``status`` as one line and count it."""
	        tweet = status._json
	        self.file.write(json.dumps(tweet) + '\n')
	        self.num_tweets += 1

	    def close(self):
	        """Close ``tweets.txt``; safe to call more than once."""
	        if not self.file.closed:
	            self.file.close()