# wandabwa2004/cleanupfix.py

## cleanupfix.py
# Discard, one at a time, the columns that the rest of this script never reads.
for unused_column in ('hotel name', 'review title', 'helpful vote', 'user name'):
    df.drop(columns=unused_column, inplace=True)

# Divide the stored rating by 10.
df['rating'] = df['rating'] / 10

# Keep only the rows that actually carry review text.
df = df[df['review_body'].notnull()]

# Strip leftover HTML markup and entity fragments from the review text.
def preprocess(review_body):
    """Remove HTML remnants from a Series of review texts.

    Args:
        review_body: pandas Series whose values are review strings.

    Returns:
        A new Series in which ``<br/>`` tags, ``<a ...>...</a>`` anchors and
        the entity prefixes ``&amp``, ``&gt`` and ``&lt`` have been removed,
        and each non-breaking space has been replaced by a regular space.
    """
    # (pattern, replacement) pairs, applied in this order.
    # The anchor pattern is non-greedy so that two links in the same review
    # are removed individually instead of swallowing the text between them.
    # NOTE(review): only the '&amp' / '&gt' / '&lt' prefix is removed; a
    # trailing ';' of a full entity stays in the text, as before.
    substitutions = (
        (r"<br/>", ""),
        (r"<a.*?>.*?</a>", ""),
        (r"&amp", ""),
        (r"&gt", ""),
        (r"&lt", ""),
        ("\xa0", " "),
    )
    for pattern, replacement in substitutions:
        # regex=True has to be explicit: from pandas 2.0 on, Series.str.replace
        # treats the pattern as a literal string by default.
        review_body = review_body.str.replace(pattern, replacement, regex=True)
    return review_body

# Replace the raw review text with its cleaned-up form.
cleaned_reviews = preprocess(df['review_body'])
df['review_body'] = cleaned_reviews

# Sentiment polarity of each review, as reported by TextBlob.
df['polarity'] = df['review_body'].apply(lambda body: TextBlob(body).sentiment.polarity)

# Number of characters in each review.
df['review_len'] = df['review_body'].astype(str).map(len)

# Number of whitespace-separated tokens in each review.
df['word_count'] = df['review_body'].map(lambda body: len(str(body).split()))
	# NOTE(review): a tab-indented repeat of the cleanup statements above was
	# removed here. It could not be parsed (unexpected indent, function body not
	# indented under its def), it would have dropped the same columns and divided
	# 'rating' by 10 a second time, and its anchor regex had lost both '*'
	# quantifiers.