Skip to content

Instantly share code, notes, and snippets.

@dhirschfeld
Created January 24, 2022 07:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dhirschfeld/7f51a825ef2fc6e026e93c46dfef3270 to your computer and use it in GitHub Desktop.
Save dhirschfeld/7f51a825ef2fc6e026e93c46dfef3270 to your computer and use it in GitHub Desktop.
Clearly structuring pandas transformations
def tweak_twitter(df: pd.DataFrame) -> pd.DataFrame:
    """Clean a Twitter analytics export and add per-tweet feature columns.

    The transformation runs in five steps:

    1. Replace spaces in column names with underscores.
    2. Drop a fixed list of uninteresting columns plus every column whose
       name contains ``'promoted'``.
    3. Downcast the metric columns to small unsigned integers and make
       ``Tweet_text`` categorical.
    4. Convert ``time`` to the ``America/Denver`` timezone.
    5. Add feature columns derived from the tweet text and timestamp.

    Args:
        df: Frame containing a ``time`` column (``tz_convert`` needs it to be
            timezone-aware) and every column named in ``type_map`` below,
            spelled with either spaces or underscores.

    Returns:
        A new frame with the index reset, so the previous index becomes a
        regular column. The input frame is not modified.

    Raises:
        KeyError: If ``time`` or a column named in ``type_map`` is missing.
    """
    # ``rename`` returns a new frame, so the caller's frame is left untouched
    # without needing an explicit copy first.
    df = df.rename(columns={col: col.replace(' ', '_') for col in df.columns})

    # filter uninteresting columns
    excluded_columns = [
        'permalink_clicks', 'app_opens', 'app_installs', 'email_tweet',
        'dial_phone',
    ]
    excluded_columns += [col for col in df.columns if 'promoted' in col]
    # An export that lacks some of these columns is already "filtered", so a
    # missing name is not treated as an error.
    df = df.drop(columns=excluded_columns, errors='ignore')

    # fix types
    type_map = dict(
        impressions=np.uint32,
        engagements=np.uint16,
        replies=np.uint8,
        hashtag_clicks=np.uint8,
        follows=np.uint8,
        retweets=np.uint16,
        likes=np.uint16,
        user_profile_clicks=np.uint16,
        url_clicks=np.uint16,
        detail_expands=np.uint16,
        media_views=np.uint16,
        media_engagements=np.uint16,
        Tweet_text='category',
    )
    df = df.astype(type_map)
    df['time'] = df['time'].dt.tz_convert('America/Denver')

    # assign feature columns
    text = df['Tweet_text']
    when = df['time']
    # Round-tripping through ASCII with errors='replace' turns every
    # non-ASCII character into '?', so any row that changes contains one.
    ascii_roundtrip = (
        text.str.encode('ascii', errors='replace').str.decode('ascii')
    )
    feature_columns = dict(
        # Every row receives this same constant string.
        Tweet_permalink='https://twitter.com/__mharrison__/status/',
        is_reply=text.str.startswith('@'),
        length=text.str.len(),
        # ``.str.len()`` measures each token list without a Python-level
        # ``apply`` and tolerates missing text instead of raising.
        num_words=text.str.split().str.len(),
        # Compare against plain object values rather than the categorical
        # column; the element-wise result is the same.
        is_unicode=ascii_roundtrip != text.astype(object),
        hour=when.dt.hour,
        dom=when.dt.day,
        dow=when.dt.dayofweek,
        at_tweet=text.str.contains('@'),
        has_newlines=text.str.contains('\n'),
        # Counts newline characters, i.e. 0 for a single-line tweet.
        num_lines=text.str.count('\n'),
        num_mentions=text.str.count('@'),
        # NOTE(review): despite the name this is a count of '#', not a
        # boolean; left unchanged because callers may rely on the counts.
        has_hashtag=text.str.count('#'),
    )
    df = df.assign(**feature_columns)
    return df.reset_index()
@dhirschfeld
Copy link
Author

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment