Skip to content

Instantly share code, notes, and snippets.

@vansika
Created July 19, 2019 14:32
Show Gist options
  • Save vansika/71a640e209cdcb0d99f974a4e4aee31c to your computer and use it in GitHub Desktop.
Save vansika/71a640e209cdcb0d99f974a4e4aee31c to your computer and use it in GitHub Desktop.
# for utils.py
def get_listens(y, m1, m2):
    """ Loads all the listens listened to in a given time window from HDFS.

    One parquet is read per month from
    '<HDFS_CLUSTER_URI>/data/listenbrainz/<y>/<month>.parquet' and the monthly
    dataframes are unioned together in month order.

    Args:
        y (int): Year to load parquets.
        m1 (int): Load parquets from month m1 (inclusive).
        m2 (int): Load parquets up to, but NOT including, month m2.
            Pass 13 to include December.

    Returns:
        df (dataframe): Union of every month that could be read, or None when
            the month range is empty or no month could be read.
            Dataframe with columns as:
            [
                'artist_mbids', 'artist_msid', 'artist_name', 'listened_at', 'recording_mbid',
                'recording_msid', 'release_mbid', 'release_msid', 'release_name', 'tags',
                'track_name', 'user_name'
            ]

    Raises:
        AttributeError: Re-raised (after logging) if reading or unioning a
            month raises it.
    """
    df = None
    for m in range(m1, m2):
        try:
            month = read_files_from_HDFS('{}/data/listenbrainz/{}/{}.parquet'.format(config.HDFS_CLUSTER_URI, y, m))
            # Compare against None explicitly: the truth value of a dataframe
            # object says nothing about whether a month has been loaded yet.
            df = month if df is None else df.union(month)
        except AnalysisException:
            # Presumably the parquet for this month does not exist; skip the
            # month and keep whatever has been collected so far.
            continue
        except AttributeError:
            logging.info('Aborting...')
            raise
    return df
# for candidate_sets.py
def get_listens():
    """ Loads the listens that fall inside the recommendation generation window.

    The window starts config.RECOMMENDATION_GENERATION_WINDOW days before the
    current UTC time and ends at the current UTC time. Listens are loaded at
    month granularity through utils.get_listens.

    Returns:
        rec_df (dataframe): Listens of every month touched by the window, or
            None if utils.get_listens found nothing to load.
    """
    # under the assumption that config.RECOMMENDATION_GENERATION_WINDOW will always be
    # between 0 and 28 (considering leap year)
    now = datetime.utcnow()
    window_start = now + relativedelta(days=-config.RECOMMENDATION_GENERATION_WINDOW)

    if window_start.year == now.year:
        # Upper bound is exclusive, so + 1 keeps the current month.
        return utils.get_listens(window_start.year, window_start.month, now.month + 1)

    # The window crosses New Year. Given the assumption above it can only
    # cover December of the previous year and January of the current one.
    december_df = utils.get_listens(window_start.year, 12, 13)
    january_df = utils.get_listens(now.year, 1, 2)

    # utils.get_listens returns None when a month could not be loaded, so
    # only union when both halves are actually present.
    if december_df is None:
        return january_df
    if january_df is None:
        return december_df
    return december_df.union(january_df)
# for create_dataframes.py
def training_data_window():
    """ Prepare dataframe of listens of X months where X is a config value.

    Starting from adjusted_date(-X), listens are loaded one calendar year at a
    time through utils.get_listens and unioned in chronological order.

    Returns:
        training_df (dataframe): None if nothing could be loaded, otherwise
            columns can be depicted as:
            [
                artist_mbids, artist_msid, artist_name, listened_at, recording_mbid,
                recording_msid, release_mbid, release_msid, release_name, tags, track_name, user_name
            ]

    Note: Under the assumption that config.TRAIN_MODEL_WINDOW will always indicate months.
    """
    training_df = None
    months_left = config.TRAIN_MODEL_WINDOW
    while months_left > 0:
        # NOTE(review): adjusted_date is defined elsewhere; presumably it
        # returns the date `months_left` months before now -- confirm.
        start = adjusted_date(-months_left)

        # utils.get_listens takes a half-open month range [first, last), so
        # 13 means "through December" when the window runs past year end.
        crosses_year_end = start.month + months_left > 12
        end_month = 13 if crosses_year_end else start.month + months_left

        df = utils.get_listens(start.year, start.month, end_month)
        # utils.get_listens returns None when no month could be loaded; skip
        # such chunks instead of passing None to union().
        if df is not None:
            training_df = df if training_df is None else training_df.union(df)

        if not crosses_year_end:
            # The remaining window fit inside this year, so we are done.
            break
        # Months start.month..12 have been consumed; continue with next year.
        months_left -= 13 - start.month
    return training_df
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment