Created
April 15, 2017 07:58
-
-
Save t-redactyl/d6eea85dddf9d586dd47f35368a646b7 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Import the relevant packages
import numpy as np
import re

# Keyword alternatives (regex fragments) that signal each resolution category.
# The dict key becomes the name of the 0/1 dummy column added to twitter_df.
category_keywords = {
    'Physical Health': (
        'weight', 'fit', 'exercise', 'gym', 'muscle', 'health', 'water',
        'smoking', 'alcohol', 'drinking', 'walk', 'run', 'swim',
    ),
    'Learning and Career': (
        # 'read' previously carried a stray leading space (' read'), which
        # kept it from matching ordinary text; 'study' was listed twice.
        'business', 'job', 'career', 'professional', 'study', 'learn',
        'develop', 'advance', 'grades', 'school', 'university', 'read',
        'skill', 'education',
    ),
    'Mental Wellbeing': (
        'positive', 'enjoy', 'happy', 'happiness', 'stress', 'depress',
        'anxi', 'organised', 'organized', 'hobb', 'fun', 'psychologist',
        'psychiatrist', 'sleep', 'meditate',
    ),
    'Finances': (
        'save', 'saving', 'debt', 'credit', 'money', 'invest', 'wast',
        'finance', 'frugal', r'\$',
    ),
    'Relationships': (
        'relationship', 'friend', 'boyfriend', 'girlfriend', 'fiance',
        'husband', 'wife', 'engaged', 'wedding', 'married', 'pregnant',
        'child', 'kid', 'family', 'parent', 'father', 'dad', 'mother', 'mom',
        'mum', 'brother', 'sister', 'dog', 'cat',
    ),
    'Travel and Holidays': (
        'travel', 'trip', 'holiday', 'vacation', 'country', 'foreign',
        'overseas', 'abroad',
    ),
}

# NOTE(review): every keyword must be followed by a non-word character or the
# end of the tweet, so stems such as 'anxi', 'hobb', 'depress' and 'wast' do
# not match "anxiety", "hobby", etc., and r'\$' does not match "$100".
# Confirm whether prefix matching was intended before relaxing the boundary.

# Create 6 new dummy variables which indicate whether one of the words
# associated with a resolution is present in the tweet.
for category, keywords in category_keywords.items():
    # Raw strings keep \W intact; the non-capturing group avoids pandas'
    # "pattern has match groups" warning from str.contains.
    pattern = r'(?:^|\W)(?:' + '|'.join(keywords) + r')(?:$|\W)'
    # na=False: a missing tweet counts as "no match". Without it the NaN
    # result is treated as truthy by np.where and the row is flagged as 1.
    is_match = twitter_df['Tweet'].str.contains(
        pattern, flags=re.IGNORECASE, na=False)
    twitter_df[category] = np.where(is_match, 1, 0)

# Find cases where tweets fall into multiple categories and delete them
col_list = list(category_keywords)
twitter_df['Number of resolutions'] = twitter_df[col_list].sum(axis=1)
multi_category = twitter_df['Number of resolutions'] > 1
twitter_df = twitter_df.drop(twitter_df[multi_category].index)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment