Skip to content

Instantly share code, notes, and snippets.

@vincefav
Last active November 4, 2020 21:39
Show Gist options
  • Save vincefav/3eedfc562de6c77c88708016c0ac58d0 to your computer and use it in GitHub Desktop.
Save vincefav/3eedfc562de6c77c88708016c0ac58d0 to your computer and use it in GitHub Desktop.
Common data science tasks
def clean_column_names(cols):
    '''
    Clean up a pandas Index of column names.

    Strips whitespace, lowercases, deletes punctuation (except underscores),
    and converts spaces to underscores.

    Sample usage:
    data.columns = clean_column_names(data.columns)
    '''
    from string import punctuation
    cols = cols.str.strip().str.lower()
    # Delete every punctuation character except '_' in one literal pass.
    # The original called .str.replace(char, '') per character, which pandas
    # historically treated as a REGEX — so '.', '*', '+', etc. acted as
    # metacharacters and could wipe out entire column names.
    to_remove = ''.join(ch for ch in punctuation if ch != '_')
    cols = cols.str.translate(str.maketrans('', '', to_remove))
    # regex=False: replace the literal space character only.
    return cols.str.replace(' ', '_', regex=False)
# Btw, it's unusual but not necessarily bad to import libraries inside your functions.
# I'm doing it to make my code more copy-pasteable.
def absolute_correlations(col, df=None):
    '''
    Sort correlations with `col` by absolute value, so the biggest
    magnitudes appear up top regardless of sign.

    col : name of the target column.
    df  : dataframe to correlate against; defaults to the global `data`,
          looked up at CALL time (the original `df=data` default was
          evaluated at import time, so the module failed to load whenever
          `data` didn't exist yet).
    Returns a one-column DataFrame ('correlation') excluding `col` itself.
    '''
    if df is None:
        df = data  # late-bound global, preserving the original default
    corrs = pd.DataFrame(
        df.select_dtypes(include=[np.number]).corrwith(df[col]),
        columns=['correlation'],
    )
    corrs['absol'] = corrs['correlation'].abs()
    corrs = corrs.sort_values('absol', ascending=False).drop(columns='absol')
    # Drop the self-correlation by label. The original used
    # .tail(len - 1), which assumes the self-correlation (|r| == 1) sorts
    # first — a tie with another perfectly-correlated column could make it
    # drop the wrong row.
    return corrs.drop(index=col, errors='ignore')
def cronbach_alpha(df):
    '''
    Cronbach's alpha for a dataframe whose columns are related test items.

    alpha = (k / (k - 1)) * (1 - sum of item variances / variance of totals)
    where k is the number of items. Sample variances (ddof=1) throughout.
    '''
    k = df.shape[1]
    item_variance_sum = df.var(axis=0, ddof=1).sum()
    total_score_variance = df.sum(axis=1).var(ddof=1)
    return (k / (k - 1)) * (1 - item_variance_sum / total_score_variance)
# Monkey-patches the dataframe so you can return numeric columns a little faster
def numeric(self):
    '''Return a view of this dataframe containing only its numeric columns.'''
    return self.select_dtypes(include=[np.number])

pd.DataFrame.numeric = numeric
# Monkey-patches pandas to include a .zscore() and .normalize() method
def zscore(self):
    '''Standardize to mean 0 / standard deviation 1 (sample std, ddof=1).'''
    centered = self - self.mean()
    return centered / self.std()

def normalize(self):
    '''Linearly rescale values onto the [0, 1] range (min-max scaling).'''
    span = self.max() - self.min()
    return (self - self.min()) / span

pd.DataFrame.zscore = zscore
pd.Series.zscore = zscore
pd.DataFrame.normalize = normalize
pd.Series.normalize = normalize
def correlation_matrix(df, figsize=(15, 7)):
    '''
    Makes a pretty heatmap of pairwise correlations.

    Only the lower triangle is drawn (the upper triangle mirrors it and is
    masked out); the first column and first row of the correlation matrix
    are trimmed since they would be entirely masked.
    '''
    matrix = df.corr()
    matrix = matrix[matrix.columns[1:]]
    matrix = matrix.tail(len(matrix) - 1)
    # Generate a mask for the upper triangle. Use the builtin `bool`:
    # the deprecated `np.bool` alias was removed in NumPy 1.24 and raises
    # AttributeError there.
    mask = np.zeros_like(matrix, dtype=bool)
    mask[np.triu_indices_from(mask)] = True
    # Resize and display
    plt.figure(figsize=figsize)
    sns.heatmap(matrix, annot=True, fmt='.2f', center=0, mask=mask, cmap='seismic_r')
def tts(x_data=None, y_data=None, test_size=.2):
    '''
    NOT recommended, but quickly splits your training and testing data,
    binding xtrain/xtest/ytrain/ytest as module-level globals.

    x_data / y_data default to the globals `x` and `y`, looked up at CALL
    time. (The original `x_data=x, y_data=y` defaults were evaluated when
    the function was defined, so the definition itself crashed whenever
    `x`/`y` didn't already exist.)
    '''
    from sklearn.model_selection import train_test_split
    if x_data is None:
        x_data = x  # late-bound global, preserving the original default
    if y_data is None:
        y_data = y
    global xtrain, xtest, ytrain, ytest
    xtrain, xtest, ytrain, ytest = train_test_split(x_data, y_data, test_size=test_size)
from scipy.spatial.distance import cosine

def cosine_similarity(a, b):
    '''Cosine similarity between two vectors, i.e. 1 minus cosine distance.'''
    return 1 - cosine(a, b)
from string import punctuation

from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, PorterStemmer, SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
def prepare_text(text, aggressiveness=2):
    '''
    Normalize, tokenize, lemmatize, and (optionally) stem input text.

    text           : raw input string.
    aggressiveness : 0 = no stemming, 1 = Porter, 2 = Snowball (default),
                     anything else > 0 = Lancaster (the harshest stemmer).
    Returns a list of processed word tokens.
    '''
    text = text.lower()
    # Replace punctuation with spaces so joined words split into tokens.
    for ch in punctuation:
        text = text.replace(ch, ' ')
    words = word_tokenize(text)
    words = [w for w in words if w not in stopwords.words('english')]
    # One lemmatizer instance, hoisted out of the loops — the original
    # constructed a fresh WordNetLemmatizer per word per POS pass.
    lemmatizer = WordNetLemmatizer()
    for pos in 'rasvn':  # WordNet POS tags: adverb, adj, satellite, verb, noun
        try:
            words = [lemmatizer.lemmatize(w, pos=pos) for w in words]
        except Exception:
            # Best-effort: skip a POS pass that fails rather than crash.
            # (Narrowed from the original bare `except:`, which also
            # swallowed KeyboardInterrupt/SystemExit.)
            continue
    if aggressiveness > 0:
        if aggressiveness == 1:
            st = PorterStemmer()
        elif aggressiveness == 2:
            st = SnowballStemmer('english')
        else:
            # The original referenced LancasterStemmer without importing it
            # (NameError), and left `st` unbound for non-integer values in
            # (0, 2); the final else fixes both.
            st = LancasterStemmer()
        words = [st.stem(w) for w in words]
    return words
def sort_dict(user_dict, ascending=False):
    '''Return the dictionary's (key, value) pairs ordered by value.

    Descending by default; pass ascending=True for smallest-first.
    '''
    ordered = sorted(user_dict.items(), key=lambda pair: pair[1])
    if not ascending:
        ordered.reverse()
    return ordered
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment