Skip to content

Instantly share code, notes, and snippets.

@mynameisvinn
Last active July 1, 2024 17:18
Show Gist options
  • Save mynameisvinn/3945844f0a3df32e0bf13d25de51d328 to your computer and use it in GitHub Desktop.
Save mynameisvinn/3945844f0a3df32e0bf13d25de51d328 to your computer and use it in GitHub Desktop.
from glob import glob
import json
from tqdm import tqdm
def _load_top_candidates(paths):
    """Return the top-ranked notification type for every notification feed.

    Each file in *paths* is read as JSON Lines (one JSON object per line).
    Only records whose ``position`` equals 0 are kept. If the same
    ``notification_feed_id`` appears in several position-0 records, the one
    read last wins.

    Args:
        paths: iterable of file paths to read.

    Returns:
        dict mapping ``notification_feed_id`` to the ``notification_type``
        of that feed's position-0 candidate.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record lacks ``position``, or a position-0 record
            lacks ``notification_feed_id`` or ``notification_type``.
    """
    top_candidates = {}
    for fp in tqdm(paths):
        with open(fp, "r", encoding="utf-8") as f:
            # Stream the file line by line instead of loading it whole.
            for line in f:
                if not line.strip():
                    # A blank (e.g. trailing) line is not a record.
                    continue
                candidate = json.loads(line)
                # we want the top candidates for each notification_feed_id
                if candidate['position'] == 0:
                    top_candidates[candidate['notification_feed_id']] = candidate['notification_type']
    return top_candidates


# Local copy of
# gs://etsy-recsys-ml-dev-data-nxsn/user/vtang/updates-ranker-v1/metrics/prod_updates_2
# NOTE(review): the original comment labelled this path "raw treatment" even
# though it feeds the prod_* variables -- confirm which arm it really is.
prod_fp = glob("./prod_updates_2/*")
# key = notification_feed_id, value = notification_type of candidate position 0
prod_updates = _load_top_candidates(prod_fp)
print(len(prod_updates))

# Local copy of
# gs://etsy-recsys-ml-dev-data-nxsn/user/vtang/updates-ranker-v1/metrics/updates-ranker-v1/
# NOTE(review): the original comment labelled this path "raw production" even
# though it feeds the treatment_* variables -- confirm which arm it really is.
# dataflow logs -> https://console.cloud.google.com/dataflow/jobs/us-central1/2024-07-01_09_45_29-15640809537440376803;step=ParquetDataLoaderAndFilterAttributions;graphView=0?project=etsy-recsys-ml-dev&pageState=(%22dfTime%22:(%22l%22:%22dfJobMaxTime%22))
treatment_fp = glob("./updates-ranker-v2/updates-ranker-v1/*")
print(len(treatment_fp))
# key = notification_feed_id, value = notification_type of candidate position 0
updates_ranker_v1 = _load_top_candidates(treatment_fp)
print(len(updates_ranker_v1))

# compare the top candidates: print every notification_feed_id present in both
# runs whose position-0 notification_type differs between them
for k, v in updates_ranker_v1.items():
    if k in prod_updates and v != prod_updates[k]:
        print(k)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment