Skip to content

Instantly share code, notes, and snippets.

@wandabwa2004
Created July 22, 2019 01:49
Show Gist options
  • Save wandabwa2004/d6e9ebcdbd6913452826ae9abc8ee1f3 to your computer and use it in GitHub Desktop.
Save wandabwa2004/d6e9ebcdbd6913452826ae9abc8ee1f3 to your computer and use it in GitHub Desktop.
Cleanup
# Remove the columns that are not used by the analysis below. A single
# in-place drop mutates the same DataFrame object the original four calls did.
df.drop(
    columns=['hotel name', 'review title', 'helpful vote', 'user name'],
    inplace=True,
)

# Divide the stored rating by 10 to rescale it.
df['rating'] = df['rating'] / 10

# Keep only rows that have review text. The explicit copy makes the result an
# independent DataFrame, so assigning new columns to it later does not trigger
# pandas' SettingWithCopyWarning (boolean-mask selection yields a derived
# object that pandas tracks as a potential copy of the original).
df = df[df['review_body'].notna()].copy()
# Strip leftover HTML markup, HTML entities and non-breaking spaces from the
# review text.
def preprocess(review_body):
    """Return *review_body* with HTML remnants removed.

    Args:
        review_body: pandas Series of review strings (used through its
            ``.str`` accessor).

    Returns:
        A new Series in which ``<br/>`` tags and ``<a ...>...</a>`` anchors
        are deleted, the ``&amp`` / ``&gt`` / ``&lt`` entities (with or
        without their trailing ``;``) are deleted, and non-breaking spaces
        (U+00A0) are replaced by ordinary spaces.
    """
    # Applied in order, one pass per pattern, exactly like the original chain
    # of replace calls.
    substitutions = (
        (r"<br/>", ""),
        # Non-greedy so that each anchor is removed on its own; a greedy
        # pattern would also swallow the text between two links.
        (r"<a.*?>.*?</a>", ""),
        # The optional ';' avoids leaving a stray semicolon behind.
        (r"&amp;?", ""),
        (r"&gt;?", ""),
        (r"&lt;?", ""),
        ("\xa0", " "),
    )
    for pattern, replacement in substitutions:
        # regex=True must be explicit: since pandas 2.0 the default is a
        # literal match, which would silently leave the text unchanged.
        review_body = review_body.str.replace(pattern, replacement, regex=True)
    return review_body
# Clean the review text once and store it back on the frame.
cleaned_reviews = preprocess(df['review_body'])
df['review_body'] = cleaned_reviews

# Sentiment polarity of each review, as reported by TextBlob.
df['polarity'] = cleaned_reviews.map(
    lambda body: TextBlob(body).sentiment.polarity
)

# Length of each review in characters.
reviews_as_text = cleaned_reviews.astype(str)
df['review_len'] = reviews_as_text.apply(len)

# Number of whitespace-separated tokens in each review.
df['word_count'] = cleaned_reviews.apply(
    lambda body: len(str(body).split())
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment