This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# function to find the mean when some have ranges and others don't
def split_mean(x):
    """Return *x* as a float, averaging the two endpoints of a range.

    Args:
        x: A single number (e.g. ``"7"``, ``"-5"``, ``3.2``) or a range
            written as ``"low-high"`` (e.g. ``"10-20"``).

    Returns:
        The number itself, or the midpoint ``(low + high) / 2`` of a range.

    Raises:
        ValueError: If *x* is neither a number nor a ``low-high`` range.
    """
    # Plain values parse directly; trying this first keeps a leading minus
    # sign ("-5") from being mistaken for the range separator.
    try:
        return float(x)
    except ValueError:
        pass

    text = x.strip()
    # Search from index 1 so a sign on the lower bound is skipped.
    separator = text.find("-", 1)
    if separator == -1:
        raise ValueError(f"could not convert string to float: {x!r}")

    low, high = text[:separator], text[separator + 1:]
    return (float(low) + float(high)) / 2
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# function to remove the leading and trailing white space in the data frame
def trim(dataset):
    """Return a copy of *dataset* with every string cell stripped.

    Args:
        dataset: A pandas DataFrame whose cells may hold strings or other
            values.

    Returns:
        A new DataFrame in which each string cell has had its leading and
        trailing whitespace removed; non-string cells are passed through
        unchanged.
    """
    def _strip_cell(value):
        # isinstance also covers str subclasses; other values are untouched.
        return value.strip() if isinstance(value, str) else value

    # DataFrame.applymap is deprecated since pandas 2.1 in favour of
    # DataFrame.map. Look the method up on the class so that a column that
    # happens to be named "map" cannot be picked up on older pandas.
    if hasattr(type(dataset), "map"):
        return dataset.map(_strip_cell)
    return dataset.applymap(_strip_cell)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Small example corpus passed to the vectorizer below.
document = ["This is Import Data's YouTube channel",
            "Data Science is my passion and it is fun",
            "Please subscribe to my channel"]
# create the transform
vectorizer = CountVectorizer()
# tokenize and make the document into a matrix
doc_term_matrix = vectorizer.fit_transform(document)
# Show the counts as a table with one column per vocabulary term.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older releases.
pd.DataFrame(
    doc_term_matrix.toarray(),
    columns=(
        vectorizer.get_feature_names_out()
        if hasattr(vectorizer, "get_feature_names_out")
        else vectorizer.get_feature_names()
    ),
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#install tweet-preprocessor to clean tweets | |
!pip install tweet-preprocessor | |
import re | |
# Set up the punctuation patterns we want replaced.
# Raw strings are used so the regex escapes (\. \; \s ...) are not treated as
# invalid Python string escapes, which warn on recent Python versions.

# Each alternative matches one punctuation character.
REPLACE_NO_SPACE = re.compile(
    r"(\.)|(\;)|(\:)|(\!)|(\')|(\?)|(\,)|(\")|(\|)"
    r"|(\()|(\))|(\[)|(\])|(\%)|(\$)|(\>)|(\<)|(\{)|(\})"
)
# Matches a doubled <br> tag, a hyphen, a slash, or a colon plus the
# character that follows it.
# NOTE(review): the first alternative has no closing ">" on the second tag,
# and "(:)." also consumes the character after the colon - both look
# unintended; confirm before changing, the pattern is kept as written.
REPLACE_WITH_SPACE = re.compile(r"(<br\s/><br\s/?)|(-)|(/)|(:).")
import preprocessor as p |