Skip to content

Instantly share code, notes, and snippets.

@importdata
importdata / function to split and take the mean
Created May 24, 2020 19:32
Function to find the average when some values are ranges (with "-"), while others aren't
# function to find the mean when some have ranges and others don't
def split_mean(x):
    """Return the numeric value of ``x``, averaging the endpoints of a range.

    Args:
        x: A number, or a string holding either a single number ("7") or a
            hyphen-separated range ("10-20").

    Returns:
        float: The value itself, or the midpoint of the two range endpoints.

    Raises:
        ValueError: If ``x`` is a string that is neither a number nor a
            two-part range of numbers.
    """
    # Cells that are already numeric (int, float, NaN) have no .split().
    if not isinstance(x, str):
        return float(x)

    text = x.strip()
    try:
        # Plain numbers, including negatives such as "-5", need no splitting.
        return float(text)
    except ValueError:
        pass

    # Search for the separating hyphen after the first character so that a
    # leading minus sign is not mistaken for the range separator.
    sep = text.find("-", 1)
    if sep == -1:
        raise ValueError("could not convert string to float: {!r}".format(x))
    low, high = text[:sep], text[sep + 1:]
    return (float(low) + float(high)) / 2
@importdata
importdata / Remove whitespaces
Created May 24, 2020 19:08
Function to remove white spaces in the dataframe
# function to remove the leading and trailing white space in the data frame
def trim(dataset):
    """Strip leading and trailing whitespace from every string cell.

    Args:
        dataset: A pandas DataFrame. Non-string cells are passed through
            unchanged.

    Returns:
        The DataFrame produced by the element-wise call; ``dataset`` itself
        is not modified by this function.
    """
    def _strip_cell(value):
        # isinstance also covers str subclasses (e.g. numpy.str_), which an
        # exact ``type(value) is str`` comparison would skip.
        return value.strip() if isinstance(value, str) else value

    # DataFrame.applymap is deprecated since pandas 2.1 in favour of
    # DataFrame.map. Check the class, not the instance, so that a column
    # named "map" on an older pandas is not mistaken for the method.
    if hasattr(type(dataset), "map"):
        return dataset.map(_strip_cell)
    return dataset.applymap(_strip_cell)
@importdata
importdata / CountVectorizer Example
Created May 18, 2020 02:16
Kaggle Twitter Data CountVectorizer Example
# Three short sentences used as the corpus for the CountVectorizer demo.
document = ["This is Import Data's YouTube channel",
            "Data Science is my passion and it is fun",
            "Please subscribe to my channel"]
# create the transform
vectorizer = CountVectorizer()
# tokenize and make the document into a matrix
doc_term_matrix = vectorizer.fit_transform(document)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# and fall back only on releases that predate get_feature_names_out().
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
# Kept as a bare expression, presumably so a notebook cell renders the table.
pd.DataFrame(doc_term_matrix.toarray(), columns=feature_names)
@importdata
importdata / Kaggle Twitter Data Cleaning
Last active May 18, 2020 01:36
Kaggle Twitter Sentiment Analysis Data Cleaning
#install tweet-preprocessor to clean tweets
!pip install tweet-preprocessor
import re
# Set up the punctuation we want to be replaced.
# Raw strings keep the regex escapes (\. \; \s ...) from being read as
# invalid Python string escapes, which newer interpreters warn about.
# Each alternative matches one punctuation character.
REPLACE_NO_SPACE = re.compile(r"(\.)|(\;)|(\:)|(\!)|(\')|(\?)|(\,)|(\")|(\|)|(\()|(\))|(\[)|(\])|(\%)|(\$)|(\>)|(\<)|(\{)|(\})")
# Matches a doubled "<br " tag prefix, a hyphen, a slash, or a colon plus
# the single character that follows it.
# NOTE(review): "(:)." also consumes the character after the colon, and the
# <br> alternative requires exactly one whitespace character and has no
# closing ">" - confirm both are intended.
REPLACE_WITH_SPACE = re.compile(r"(<br\s/><br\s/?)|(-)|(/)|(:).")
import preprocessor as p