Skip to content

Instantly share code, notes, and snippets.

View ant358's full-sized avatar
🎯
Focusing

Anthony Wynne ant358

🎯
Focusing
  • Plymouth, UK
View GitHub Profile
def anova_machine(Cat_col, target_col, df):
"""Pivot the data set by a categorical column as the first step of an ANOVA.

Args:
    Cat_col: Name of the categorical column; passed as ``columns`` to ``df.pivot``.
    target_col: Name of the target variable (y) column; passed as ``values`` to ``df.pivot``.
    df: The main data set. It must provide a ``pivot`` method (presumably a
        pandas DataFrame -- confirm against callers).

According to the original author's notes, an ANOVA is then performed to see
whether the pivoted columns are significantly different from each other,
currently at a 95% confidence level, with a configurable significance
setting planned for later.

NOTE(review): the ANOVA step itself and any return value are not visible in
this excerpt; only the pivot and the two size computations below are.
"""
# Reshape so the target values are laid out under the categories of Cat_col
# (pandas ``pivot`` semantics assumed).
p_table = df.pivot(columns=Cat_col, values=target_col)
# Dimensions of the pivot table; their later use is not visible in this excerpt.
total_columns = len(p_table.columns)
total_rows = len(p_table)
def sig_num_columns(X_train, y_train, p_thres=0.05):
"""Regress the target on each numerical training column to find significant ones.

Each column of ``X_train`` is passed, together with ``y_train``, to
``scipy.stats.linregress``.

Args:
    X_train: Training features. Pass numerical columns only; per the original
        author's notes, other column types raise a shape error.
    y_train: Target values paired with the rows of ``X_train``.
    p_thres: p-value threshold. The default of 0.05 corresponds to a 95%
        confidence level; pass a different value to change it.

Returns:
    Per the original author's notes, a dataframe holding the column name and
    p-value of the significant columns only.

NOTE(review): the comparison against ``p_thres``, the filling of ``sig_num``
and the return statement are not visible in this excerpt.
"""
from scipy.stats import linregress
# The result container is published as a module-level global and reset on every call.
global sig_num
sig_num = {}
# Iterating over X_train yields column labels (assuming a pandas DataFrame -- TODO confirm).
for col in X_train:
slope, intercept, rvalue, pvalue, stderr = linregress(X_train[col], y_train)
def replace_all_NaN(df):
"""Fill missing values across the whole data frame, column by column.

Use this only if you are confident that missing numbers can be replaced
with 0 and missing objects with ``'No_<column name>'``.

Behaviour visible in this excerpt:
    * ``object`` columns containing NaN are filled with ``'No_' + col``.
    * ``float64`` columns containing NaN are filled with ``0.0``.
    * Columns of any other dtype are left untouched; the original author
      planned to add more data types over time.

The filled columns are assigned back onto ``df``, so the caller's data
frame is modified.

Args:
    df: The data frame to clean (presumably a pandas DataFrame -- confirm).

NOTE(review): the original notes say ``info()`` is printed when finished so
you can check every NaN was captured; that call is not visible in this excerpt.
"""
for col in df:
# Text-like columns: substitute a per-column placeholder built from the column name.
if df[col].dtype == 'object' and df[col].isna().sum() > 0:
df[col] = df[col].fillna('No_' + col)
# float64 columns: substitute zero.
elif df[col].dtype == 'float64' and df[col].isna().sum() > 0:
df[col] = df[col].fillna(0.0)
def first_n_pairs(dict_to_see, n):
    """Print and return the first ``n`` key/value pairs of a dictionary.

    Useful with large dictionaries to see what the data looks like.

    Args:
        dict_to_see: Dictionary to preview; pairs are taken in iteration order.
        n: Number of leading pairs to show. ``None`` shows every pair, and a
            negative value keeps list-slice semantics (all but the last
            ``-n`` pairs).

    Returns:
        A new ``dict`` containing the selected pairs. The same dict is also
        printed, as before.
    """
    from itertools import islice

    if n is None or n >= 0:
        # Walk only the first n keys rather than copying every key of a
        # potentially huge dictionary just to slice off a handful.
        keys = islice(dict_to_see, n)
    else:
        # A negative n needs the total length, so fall back to slicing.
        keys = list(dict_to_see)[:n]

    preview = {key: dict_to_see[key] for key in keys}
    print(preview)
    return preview
@ant358
ant358 / tweet_listener.py
Created January 5, 2019 17:33 — forked from hugobowne/tweet_listener.py
Here I define a Tweet listener that creates a file called 'tweets.txt', collects streaming tweets as JSON objects and writes them to that file, one per line; once 100 tweets have been streamed, the listener closes the file and stops listening.
class MyStreamListener(tweepy.StreamListener):
def __init__(self, api=None):
super(MyStreamListener, self).__init__()
self.num_tweets = 0
self.file = open("tweets.txt", "w")
def on_status(self, status):
tweet = status._json
self.file.write( json.dumps(tweet) + '\n' )
self.num_tweets += 1