# Import the relevant packages
import re

import numpy as np

# Keyword alternations for each resolution category. Each entry is dropped
# into a whole-word, case-insensitive regex below, and the dictionary key is
# the name of the 0/1 indicator column added to twitter_df.
#
# Truncated stems (depress, anxi, hobb, wast) carry a trailing \w* so that
# they match their inflections ("depressed", "anxiety", "hobbies", "waste");
# as bare alternatives inside a whole-word match they could only ever match
# the stem itself.
#
# NOTE: '\$' is still wrapped in the whole-word boundary, so it matches a
# free-standing '$' (or '$$$') but not an amount such as '$100'.
RESOLUTION_KEYWORDS = {
    'Physical Health': (
        r'weight|fit|exercise|gym|muscle|health|water|smoking|alcohol'
        r'|drinking|walk|run|swim'
    ),
    'Learning and Career': (
        r'business|job|career|professional|study|learn|develop|advance'
        r'|grades|school|university|read|skill|education'
    ),
    'Mental Wellbeing': (
        r'positive|enjoy|happy|happiness|stress|depress\w*|anxi\w*'
        r'|organised|organized|hobb\w*|fun|psychologist|psychiatrist'
        r'|sleep|meditate'
    ),
    'Finances': (
        r'save|saving|debt|credit|money|invest|wast\w*|finance|frugal|\$'
    ),
    'Relationships': (
        r'relationship|friend|boyfriend|girlfriend|fiance|husband|wife'
        r'|engaged|wedding|married|pregnant|child|kid|family|parent|father'
        r'|dad|mother|mom|mum|brother|sister|dog|cat'
    ),
    'Travel and Holidays': (
        r'travel|trip|holiday|vacation|country|foreign|overseas|abroad'
    ),
}

# Create 6 new dummy variables which indicate whether one of the words
# associated with a resolution is present in the tweet.
for category, keywords in RESOLUTION_KEYWORDS.items():
    # A keyword must be delimited by the start/end of the tweet or by a
    # non-word character. Non-capturing groups are used because only a
    # yes/no answer is needed (capturing groups make pandas emit a
    # "pattern has match groups" warning).
    pattern = r'(?:^|\W)(?:' + keywords + r')(?:$|\W)'
    # na=False: a missing tweet counts as "no keyword present" rather than
    # passing NaN on to np.where as the condition.
    has_keyword = twitter_df['Tweet'].str.contains(
        pattern, flags=re.IGNORECASE, na=False
    )
    twitter_df[category] = np.where(has_keyword, 1, 0)

# Find cases where tweets fall into multiple categories and delete them.
# The count is taken over exactly the indicator columns created above.
resolution_columns = list(RESOLUTION_KEYWORDS)
twitter_df['Number of resolutions'] = twitter_df[resolution_columns].sum(axis=1)
# Filter with a boolean mask instead of drop(index): dropping by label would
# also remove unrelated rows that happen to share a duplicated index label.
# .copy() keeps later column assignments from warning about a slice.
twitter_df = twitter_df.loc[twitter_df['Number of resolutions'] <= 1].copy()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment