Last active
October 31, 2018 09:59
-
-
Save DFoly/5d335e6a28ac1b14bd9f439ca1268338 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def clean_tweets(self, df):
    """
    Clean the raw tweets in ``df['tweet']`` so we can carry out analysis.

    Strips non-letter characters and noise tokens (rt/RT, http, co),
    lowercases, removes English stopwords, and lemmatizes the
    remaining words.

    Adds two columns to ``df``:
      - ``clean_tweets``: the cleaned, space-joined words
      - ``len``: character length of each cleaned tweet

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``tweet`` column of raw tweet strings.

    Returns
    -------
    pandas.DataFrame
        The same DataFrame, mutated in place with the two new columns.
    """
    # set() gives O(1) stopword membership tests inside the loop
    stopword_list = set(stopwords.words('english'))
    wordnet_lemmatizer = WordNetLemmatizer()

    # Compile once, outside the loop (the original rebuilt the pattern
    # every iteration).  \b word boundaries fix a bug where the bare
    # alternatives 'rt', 'http', 'co' were stripped out of the MIDDLE
    # of real words (e.g. 'start' -> 'sta t').
    exclusions = re.compile(r'[^a-zA-Z]|\brt\b|\bhttp\b|\bco\b|\bRT\b')

    cleaned = []
    # Iterate values directly instead of range(len(...)) indexing, so a
    # non-default DataFrame index cannot break the loop.
    for raw in df['tweet']:
        text = exclusions.sub(' ', raw).lower()
        words = [wordnet_lemmatizer.lemmatize(word)
                 for word in text.split()
                 if word not in stopword_list]
        cleaned.append(' '.join(words))

    # Whole-column assignment avoids the chained-assignment
    # (SettingWithCopyWarning) pattern df['clean_tweets'][i] = ...
    df['clean_tweets'] = cleaned
    # Bug fix: the original read from an undefined name `data` here,
    # which raises NameError; it must read back from `df`.
    df['len'] = np.array([len(tweet) for tweet in df['clean_tweets']])
    return df
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment