Skip to content

Instantly share code, notes, and snippets.

@vincefav
Last active November 4, 2020 21:39
Show Gist options
  • Save vincefav/3eedfc562de6c77c88708016c0ac58d0 to your computer and use it in GitHub Desktop.
Save vincefav/3eedfc562de6c77c88708016c0ac58d0 to your computer and use it in GitHub Desktop.
Common data science tasks
def clean_column_names(cols):
    '''
    Clean up a pandas Index of column names.

    Strips whitespace, lowercases, deletes punctuation (except underscores),
    and converts spaces to underscores.

    Sample usage:
    data.columns = clean_column_names(data.columns)
    '''
    from string import punctuation
    cols = cols.str.strip().str.lower()
    # Delete every punctuation character except '_' in one literal pass.
    # The original called .str.replace(char, '') per character, which pandas
    # historically treated as a REGEX — so '.', '*', '+', etc. acted as
    # metacharacters and could wipe out entire column names.
    to_remove = ''.join(ch for ch in punctuation if ch != '_')
    cols = cols.str.translate(str.maketrans('', '', to_remove))
    # regex=False: replace the literal space character only.
    return cols.str.replace(' ', '_', regex=False)
# Btw, it's unusual but not necessarily bad to import libraries inside your functions.
# I'm doing it to make my code more copy-pasteable.
def absolute_correlations(col, df=None):
    '''
    Sort correlations with `col` by absolute value, so the biggest
    magnitudes appear up top regardless of sign.

    col : name of the target column.
    df  : dataframe to correlate against; defaults to the global `data`,
          looked up at CALL time (the original `df=data` default was
          evaluated at import time, so the module failed to load whenever
          `data` didn't exist yet).
    Returns a one-column DataFrame ('correlation') excluding `col` itself.
    '''
    if df is None:
        df = data  # late-bound global, preserving the original default
    corrs = pd.DataFrame(
        df.select_dtypes(include=[np.number]).corrwith(df[col]),
        columns=['correlation'],
    )
    corrs['absol'] = corrs['correlation'].abs()
    corrs = corrs.sort_values('absol', ascending=False).drop(columns='absol')
    # Drop the self-correlation by label. The original used
    # .tail(len - 1), which assumes the self-correlation (|r| == 1) sorts
    # first — a tie with another perfectly-correlated column could make it
    # drop the wrong row.
    return corrs.drop(index=col, errors='ignore')
def cronbach_alpha(df):
    '''
    Cronbach's alpha for a dataframe whose columns are related test items.

    alpha = (k / (k - 1)) * (1 - sum of item variances / variance of totals)
    where k is the number of items. Sample variances (ddof=1) throughout.
    '''
    k = df.shape[1]
    item_variance_sum = df.var(axis=0, ddof=1).sum()
    total_score_variance = df.sum(axis=1).var(ddof=1)
    return (k / (k - 1)) * (1 - item_variance_sum / total_score_variance)
# Monkey-patches the dataframe so you can return numeric columns a little faster
def numeric(self):
    '''Return a view of this dataframe containing only its numeric columns.'''
    return self.select_dtypes(include=[np.number])

pd.DataFrame.numeric = numeric
# Monkey-patches pandas to include a .zscore() and .normalize() method
def zscore(self):
    '''Standardize to mean 0 / standard deviation 1 (sample std, ddof=1).'''
    centered = self - self.mean()
    return centered / self.std()

def normalize(self):
    '''Linearly rescale values onto the [0, 1] range (min-max scaling).'''
    span = self.max() - self.min()
    return (self - self.min()) / span

pd.DataFrame.zscore = zscore
pd.Series.zscore = zscore
pd.DataFrame.normalize = normalize
pd.Series.normalize = normalize
def correlation_matrix(df, figsize=(15, 7)):
    '''
    Makes a pretty heatmap of pairwise correlations.

    Only the lower triangle is drawn (the upper triangle mirrors it and is
    masked out); the first column and first row of the correlation matrix
    are trimmed since they would be entirely masked.
    '''
    matrix = df.corr()
    matrix = matrix[matrix.columns[1:]]
    matrix = matrix.tail(len(matrix) - 1)
    # Generate a mask for the upper triangle. Use the builtin `bool`:
    # the deprecated `np.bool` alias was removed in NumPy 1.24 and raises
    # AttributeError there.
    mask = np.zeros_like(matrix, dtype=bool)
    mask[np.triu_indices_from(mask)] = True
    # Resize and display
    plt.figure(figsize=figsize)
    sns.heatmap(matrix, annot=True, fmt='.2f', center=0, mask=mask, cmap='seismic_r')
def tts(x_data=None, y_data=None, test_size=.2):
    '''
    NOT recommended, but quickly splits your training and testing data,
    binding xtrain/xtest/ytrain/ytest as module-level globals.

    x_data / y_data default to the globals `x` and `y`, looked up at CALL
    time. (The original `x_data=x, y_data=y` defaults were evaluated when
    the function was defined, so the definition itself crashed whenever
    `x`/`y` didn't already exist.)
    '''
    from sklearn.model_selection import train_test_split
    if x_data is None:
        x_data = x  # late-bound global, preserving the original default
    if y_data is None:
        y_data = y
    global xtrain, xtest, ytrain, ytest
    xtrain, xtest, ytrain, ytest = train_test_split(x_data, y_data, test_size=test_size)
from scipy.spatial.distance import cosine

def cosine_similarity(a, b):
    '''Cosine similarity between two vectors, i.e. 1 minus cosine distance.'''
    return 1 - cosine(a, b)
from string import punctuation

from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, PorterStemmer, SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
def prepare_text(text, aggressiveness=2):
    '''
    Normalize, tokenize, lemmatize, and (optionally) stem input text.

    text           : raw input string.
    aggressiveness : 0 = no stemming, 1 = Porter, 2 = Snowball (default),
                     anything else > 0 = Lancaster (the harshest stemmer).
    Returns a list of processed word tokens.
    '''
    text = text.lower()
    # Replace punctuation with spaces so joined words split into tokens.
    for ch in punctuation:
        text = text.replace(ch, ' ')
    words = word_tokenize(text)
    words = [w for w in words if w not in stopwords.words('english')]
    # One lemmatizer instance, hoisted out of the loops — the original
    # constructed a fresh WordNetLemmatizer per word per POS pass.
    lemmatizer = WordNetLemmatizer()
    for pos in 'rasvn':  # WordNet POS tags: adverb, adj, satellite, verb, noun
        try:
            words = [lemmatizer.lemmatize(w, pos=pos) for w in words]
        except Exception:
            # Best-effort: skip a POS pass that fails rather than crash.
            # (Narrowed from the original bare `except:`, which also
            # swallowed KeyboardInterrupt/SystemExit.)
            continue
    if aggressiveness > 0:
        if aggressiveness == 1:
            st = PorterStemmer()
        elif aggressiveness == 2:
            st = SnowballStemmer('english')
        else:
            # The original referenced LancasterStemmer without importing it
            # (NameError), and left `st` unbound for non-integer values in
            # (0, 2); the final else fixes both.
            st = LancasterStemmer()
        words = [st.stem(w) for w in words]
    return words
def sort_dict(user_dict, ascending=False):
    '''Return the dictionary's (key, value) pairs ordered by value.

    Descending by default; pass ascending=True for smallest-first.
    '''
    ordered = sorted(user_dict.items(), key=lambda pair: pair[1])
    if not ascending:
        ordered.reverse()
    return ordered
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment