@pgao
Last active February 13, 2024 23:57
A script to upload the Yelp reviews dataset (https://huggingface.co/datasets/yelp_review_full) to Tidepool.
#!/usr/bin/env python3
import datetime
import pandas as pd
import os
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from time import sleep
from uuid import uuid4
#################################
# Set up URLs + request headers #
#################################
requests_with_retry = requests.Session()
# Retry transient 5xx responses up to 3 times with exponential backoff; POST must be listed in
# allowed_methods explicitly because urllib3's Retry does not retry POST by default.
requests_with_retry.mount("https://", HTTPAdapter(max_retries=Retry(total=3, backoff_factor=1, raise_on_status=False, status_forcelist=[500, 502, 503, 504], allowed_methods=["POST"])))
DATA_FILEPATH = 'train-00000-of-00001.parquet'
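# NOTE: this assumes the training-split parquet file has already been downloaded from the
# dataset's "Files and versions" tab on Hugging Face; the exact filename may differ by revision.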
# submission and validation endpoints - reference https://docs.tidepool.so/reference/
API_BASE = 'https://shoreline.tidepool.so/api/v1/events'
# API keys are project specific - reference https://docs.tidepool.so/docs/manage-api-keys
headers = {
    "X-Tidepool-Api-Key": os.environ.get('TIDEPOOL_API_KEY'),
    "Content-Type": "application/json",
}
############################################
# Load in data, batch it up, and format it #
############################################
# A little utility to batch data, since the API takes up to 100 events per batch
def make_batches(all_entries, n=100):
    return [all_entries[i : i + n] for i in range(0, len(all_entries), n)]
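# For example, 250 formatted events become three batches of 100, 100, and 50.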
df = pd.read_parquet(DATA_FILEPATH)
formatted_events = []
for i in range(df.shape[0]):
    # Set all required fields from the entry
    event = {}
    event['text'] = df['text'][i]
    # fake iso timestamp
    event['timestamp'] = datetime.datetime.now().isoformat()
    event['properties'] = {
        'label': int(df['label'][i])
    }
    # fake out required fields
    event['id'] = str(uuid4())
    event['session_id'] = str(uuid4())
    event['user_id'] = str(uuid4())
    event['event_name'] = 'USER_MESSAGE'
    formatted_events.append(event)
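# Each formatted event now looks roughly like this (values are illustrative):
# {
#   "text": "Great food, friendly staff...",
#   "timestamp": "2024-02-13T15:04:05.123456",
#   "properties": {"label": 4},
#   "id": "<uuid4>", "session_id": "<uuid4>", "user_id": "<uuid4>",
#   "event_name": "USER_MESSAGE"
# }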
batched_events = make_batches(formatted_events)
######################
# Make the requests! #
######################
# Validate a single batch of data before going through the rest.
validate_resp = requests.post(f'{API_BASE}/validate', json=batched_events[0], headers=headers)
try:
    validate_resp.raise_for_status()
except requests.exceptions.HTTPError as e:
    print(e.args[0])
except requests.exceptions.RequestException as e:
    cause = e.args[0]
    print(str(cause.args[0]))
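# Note: if validation fails, the error is only printed; the script still proceeds to submit below.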
# Submit all the batches!
for i, batch in enumerate(batched_events):
    track_response = requests_with_retry.post(f'{API_BASE}/track', json=batch, headers=headers)
    if not track_response.ok:
        print(track_response.text)
    track_response.raise_for_status()
    print(f"Submitted batch {i + 1} of {len(batched_events)}")
    sleep(0.5)
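# Example invocation (script filename is arbitrary; the project API key must be set in the environment):
#   TIDEPOOL_API_KEY=<your-project-api-key> python3 upload_yelp_reviews.py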