Created
July 19, 2019 14:32
-
-
Save vansika/71a640e209cdcb0d99f974a4e4aee31c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# for utils.py
def get_listens(y, m1, m2):
    """ Loads all the listens listened to in a given time window from HDFS.

        The window is half-open: months m1, m1 + 1, ..., m2 - 1 of year y are
        loaded; month m2 itself is NOT loaded. Pass m2 = 13 to include December.

        Args:
            y (int): Year to load parquets.
            m1 (int): First month to load (inclusive).
            m2 (int): Month to stop at (exclusive).

        Returns:
            df (dataframe): Union of the monthly dataframes that could be read, or
                None if the range is empty or no month could be read.
                Columns, as documented for the stored listens:
                [
                    'artist_mbids', 'artist_msid', 'artist_name', 'listened_at', 'recording_mbid',
                    'recording_msid', 'release_mbid', 'release_msid', 'release_name', 'tags',
                    'track_name', 'user_name'
                ]

        Raises:
            AttributeError: Re-raised after logging if raised while loading a month.
    """
    df = None
    for m in range(m1, m2):
        try:
            month = read_files_from_HDFS('{}/data/listenbrainz/{}/{}.parquet'.format(config.HDFS_CLUSTER_URI, y, m))
            # Compare against None explicitly rather than relying on the
            # truthiness of a dataframe object.
            df = month if df is None else df.union(month)
        except AnalysisException:
            # Presumably the parquet for this month does not exist; skip it and
            # keep loading the remaining months.
            continue
        except AttributeError:
            logging.info('Aborting...')
            raise
    return df
# for candidate_sets.py
def get_listens():
    """ Loads the listens falling in the recommendation generation window.

        The window starts config.RECOMMENDATION_GENERATION_WINDOW days before the
        current UTC time and ends in the current month.

        Note: Under the assumption that config.RECOMMENDATION_GENERATION_WINDOW will
        always be between 0 and 28 days, so a window that crosses a year boundary
        only ever spans December of the previous year and January of this year.

        Returns:
            rec_df (dataframe): Listens of the months covered by the window, or None
                if utils.get_listens returned no data for any of them.
    """
    now = datetime.utcnow()
    window_start = now + relativedelta(days=-config.RECOMMENDATION_GENERATION_WINDOW)

    if window_start.year == now.year:
        # End month is exclusive in utils.get_listens, hence the + 1.
        return utils.get_listens(window_start.year, window_start.month, now.month + 1)

    # Window crosses the year boundary: load December and January separately.
    december_df = utils.get_listens(window_start.year, 12, 13)
    january_df = utils.get_listens(now.year, 1, 2)

    # utils.get_listens returns None when nothing could be loaded; calling
    # union on (or with) None would fail, so fall back to whichever half exists.
    if december_df is None:
        return january_df
    if january_df is None:
        return december_df
    return december_df.union(january_df)
# for create_dataframes.py
def training_data_window():
    """ Prepare dataframe of listens of X months where X is a config value.

        The window is walked one calendar year at a time: each iteration loads the
        months of a single year through utils.get_listens and unions them into the
        result.

        Returns:
            training_df (dataframe): Columns can be depicted as:
                [
                    artist_mbids, artist_msid, artist_name, listened_at, recording_mbid,
                    recording_msid, release_mbid, release_msid, release_name, tags, track_name, user_name
                ]
                None if config.TRAIN_MODEL_WINDOW is not positive or no listens
                could be loaded.

        Note: Under the assumption that config.TRAIN_MODEL_WINDOW will always indicate months.
    """
    training_df = None
    months_left = config.TRAIN_MODEL_WINDOW
    while months_left > 0:
        # Presumably the date months_left months before today -- confirm
        # against adjusted_date's definition.
        start = adjusted_date(-months_left)
        if start.month + months_left > 12:
            # Remaining window runs past December: load to the end of this
            # year (13 is the exclusive end) and continue with what is left.
            end_month = 13
            months_loaded = 13 - start.month
        else:
            # Remaining window fits inside this year: this is the final chunk.
            end_month = start.month + months_left
            months_loaded = months_left

        df = utils.get_listens(start.year, start.month, end_month)
        # utils.get_listens returns None when nothing could be loaded; skip it
        # instead of passing None to union.
        if df is not None:
            training_df = df if training_df is None else training_df.union(df)

        months_left -= months_loaded
    return training_df
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment