Jaemin Lee importdata

## function to split and take the mean
# function to find the mean when some have ranges and others don't
def split_mean(x):
    """Return a value as a float, averaging it when it is a "low-high" range.

    Args:
        x: Either a string such as "10-20" (a range) or "15" (a single
            number), or a value that is already numeric.

    Returns:
        The midpoint of the range when ``x`` is "low-high", otherwise
        ``float(x)``.

    Raises:
        ValueError: If a string cannot be parsed as a number or a range.
    """
    # Cells that are already numeric (ints, floats, NaN) have no hyphen to
    # split on; str.split would fail on them.
    if not isinstance(x, str):
        return float(x)

    # split before and after the hyphen (-)
    parts = x.split("-")

    # Exactly two non-empty pieces means "low-high". A lone leading minus
    # ("-5" -> ["", "5"]) is a negative number, not a range.
    if len(parts) == 2 and all(part.strip() for part in parts):
        low, high = parts
        return (float(low) + float(high)) / 2

    # those that aren't a range
    return float(x)

## Remove whitespaces
# function to remove the leading and trailing white space in the data frame
def trim(dataset):
    """Strip leading and trailing whitespace from every string cell.

    Args:
        dataset: A pandas DataFrame.

    Returns:
        The DataFrame produced by applying the strip element-wise; cells that
        are not strings are passed through unchanged.
    """
    def _strip_if_str(value):
        # Only strings have surrounding whitespace to remove.
        return value.strip() if isinstance(value, str) else value

    # DataFrame.applymap is deprecated since pandas 2.1 in favour of
    # DataFrame.map. Check the class, not the instance, so a column that
    # happens to be named "map" is not mistaken for the method on older pandas.
    if hasattr(type(dataset), "map"):
        return dataset.map(_strip_if_str)
    return dataset.applymap(_strip_if_str)

## CountVectorizer Example
# Three short example sentences to vectorize.
document = ["This is Import Data's YouTube channel",
            "Data Science is my passion and it is fun",
            "Please subscribe to my channel"]
# create the transform
vectorizer = CountVectorizer()

# tokenize and make the document into a matrix
doc_term_matrix = vectorizer.fit_transform(document)

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() and fall back only on releases that predate it.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Show the matrix as a table with one column per feature name.
pd.DataFrame(doc_term_matrix.toarray(), columns=feature_names)

## Kaggle Twitter Data Cleaning
# Install tweet-preprocessor, which is used to clean tweets.
# NOTE: the leading "!" is IPython/Jupyter shell-escape syntax, so this line
# only works inside a notebook cell; it is not valid in a plain .py script.
!pip install tweet-preprocessor

import re
# Set up the punctuation we want to be replaced.
# Raw strings keep the regex escapes (\. \s ...) from being read as invalid
# Python string escapes, which warn on current Python versions.

# Punctuation to delete outright: one capture group per character.
REPLACE_NO_SPACE = re.compile(r"(\.)|(\;)|(\:)|(\!)|(\')|(\?)|(\,)|(\")|(\|)|(\()|(\))|(\[)|(\])|(\%)|(\$)|(\>)|(\<)|(\{)|(\})")

# Sequences to turn into a space: a doubled <br> tag, a hyphen, a slash,
# or a colon together with the single character that follows it.
# NOTE(review): the first branch requires exactly one whitespace character
# after each "br" and never consumes the final ">" -- confirm this is intended.
# NOTE(review): "(:)." also swallows the character after the colon -- confirm.
REPLACE_WITH_SPACE = re.compile(r"(<br\s/><br\s/?)|(-)|(/)|(:).")

import preprocessor as p
	# function to find the mean when some have ranges and others don't
	def split_mean(x):
		"""Return a value as a float, averaging it when it is a "low-high" range.

		Args:
			x: Either a string such as "10-20" (a range) or "15" (a single
				number), or a value that is already numeric.

		Returns:
			The midpoint of the range when ``x`` is "low-high", otherwise
			``float(x)``.

		Raises:
			ValueError: If a string cannot be parsed as a number or a range.
		"""
		# Cells that are already numeric (ints, floats, NaN) have no hyphen to
		# split on; str.split would fail on them.
		if not isinstance(x, str):
			return float(x)

		# split before and after the hyphen (-)
		parts = x.split("-")

		# Exactly two non-empty pieces means "low-high". A lone leading minus
		# ("-5" -> ["", "5"]) is a negative number, not a range.
		if len(parts) == 2 and all(part.strip() for part in parts):
			low, high = parts
			return (float(low) + float(high)) / 2

		# those that aren't a range
		return float(x)
	# Three short example sentences to vectorize.
	document = ["This is Import Data's YouTube channel",
	"Data Science is my passion and it is fun",
	"Please subscribe to my channel"]
	# create the transform
	vectorizer = CountVectorizer()

	# tokenize and make the document into a matrix
	doc_term_matrix = vectorizer.fit_transform(document)

	# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
	# get_feature_names_out() and fall back only on releases that predate it.
	if hasattr(vectorizer, "get_feature_names_out"):
		feature_names = vectorizer.get_feature_names_out()
	else:
		feature_names = vectorizer.get_feature_names()

	# Show the matrix as a table with one column per feature name.
	pd.DataFrame(doc_term_matrix.toarray(), columns=feature_names)
	# Install tweet-preprocessor, which is used to clean tweets.
	# NOTE: the leading "!" is IPython/Jupyter shell-escape syntax, so this line
	# only works inside a notebook cell; it is not valid in a plain .py script.
	!pip install tweet-preprocessor

	import re
	# Set up the punctuation we want to be replaced.
	# The alternation bars must be bare "|": written as "\|" they match a literal
	# pipe, which turns each pattern into one long fixed sequence that
	# practically never matches. Raw strings keep the remaining regex escapes
	# (\. \s ...) from being read as invalid Python string escapes.

	# Punctuation to delete outright: one capture group per character.
	REPLACE_NO_SPACE = re.compile(r"(\.)|(\;)|(\:)|(\!)|(\')|(\?)|(\,)|(\")|(\|)|(\()|(\))|(\[)|(\])|(\%)|(\$)|(\>)|(\<)|(\{)|(\})")

	# Sequences to turn into a space: a doubled <br> tag, a hyphen, a slash,
	# or a colon together with the single character that follows it.
	# NOTE(review): the first branch never consumes the final ">" and "(:)."
	# also swallows the character after the colon -- confirm both are intended.
	REPLACE_WITH_SPACE = re.compile(r"(<br\s/><br\s/?)|(-)|(/)|(:).")

	import preprocessor as p