Skip to content

Instantly share code, notes, and snippets.

@t-redactyl
Created April 15, 2017 07:58
Show Gist options
  • Save t-redactyl/d6eea85dddf9d586dd47f35368a646b7 to your computer and use it in GitHub Desktop.
Save t-redactyl/d6eea85dddf9d586dd47f35368a646b7 to your computer and use it in GitHub Desktop.
# Import the relevant packages
import numpy as np
import re
# Create 6 new dummy (0/1) variables, one per resolution category, which
# indicate whether one of the words associated with that category is present
# in the tweet.
#
# A keyword only counts when it is delimited on both sides by a non-word
# character or by the start/end of the tweet; matching is case-insensitive.
# NOTE(review): because of the trailing delimiter, stem-like entries such as
# 'depress', 'anxi', 'hobb' and 'wast' only match as complete tokens (e.g. not
# 'depressed') -- confirm whether prefix matching was intended.
# NOTE(review): the '\$' entry likewise only matches a '$' that is not
# directly followed by a word character (so not '$100') -- confirm intent.
resolution_keywords = [
    ('Physical Health', [
        'weight', 'fit', 'exercise', 'gym', 'muscle', 'health', 'water',
        'smoking', 'alcohol', 'drinking', 'walk', 'run', 'swim',
    ]),
    ('Learning and Career', [
        # 'read' previously carried a leading space (' read'), which combined
        # with the leading delimiter meant "to read more" never matched; the
        # duplicated 'study' entry was also dropped (no effect on matching).
        'business', 'job', 'career', 'professional', 'study', 'learn',
        'develop', 'advance', 'grades', 'school', 'university', 'read',
        'skill', 'education',
    ]),
    ('Mental Wellbeing', [
        'positive', 'enjoy', 'happy', 'happiness', 'stress', 'depress',
        'anxi', 'organised', 'organized', 'hobb', 'fun', 'psychologist',
        'psychiatrist', 'sleep', 'meditate',
    ]),
    ('Finances', [
        'save', 'saving', 'debt', 'credit', 'money', 'invest', 'wast',
        'finance', 'frugal', r'\$',
    ]),
    ('Relationships', [
        'relationship', 'friend', 'boyfriend', 'girlfriend', 'fiance',
        'husband', 'wife', 'engaged', 'wedding', 'married', 'pregnant',
        'child', 'kid', 'family', 'parent', 'father', 'dad', 'mother', 'mom',
        'mum', 'brother', 'sister', 'dog', 'cat',
    ]),
    ('Travel and Holidays', [
        'travel', 'trip', 'holiday', 'vacation', 'country', 'foreign',
        'overseas', 'abroad',
    ]),
]

for category, keywords in resolution_keywords:
    # Raw strings keep '\W' as a regex escape rather than a (deprecated)
    # string escape; the non-capturing group avoids pandas' "pattern has match
    # groups" warning from str.contains.
    pattern = r'(?:^|\W)(?:' + '|'.join(keywords) + r')(?:$|\W)'
    # na=False: a missing tweet counts as "no keyword". Without it the NaN
    # result is truthy in np.where and the row would be flagged everywhere.
    has_keyword = twitter_df['Tweet'].str.contains(
        pattern, flags=re.IGNORECASE, na=False)
    twitter_df[category] = np.where(has_keyword, 1, 0)

# Find cases where tweets fall into multiple categories and delete them
# col_list is defined here so the sum below does not depend on a name created
# elsewhere; it lists exactly the six category columns built above.
col_list = [category for category, _ in resolution_keywords]
twitter_df['Number of resolutions'] = twitter_df[col_list].sum(axis=1)
twitter_df = twitter_df.drop(twitter_df[twitter_df['Number of resolutions'] > 1].index)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment