Fetch machine-stats time-series data from the Tidal platform
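# Prerequisites (assumed, not checked by this script): the `tidal` CLI is installed and
# already authenticated, and pandas can read/write feather files (i.e. pyarrow is installed).
# Outputs: servers.csv, servers_without_machine_info.csv, output.csv, output_aggregated.csv,
# and servers_no_measurements_but_with_apps.csv, plus .feather caches used for re-runs.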
import json
import os
import pandas as pd
testing_flag = False # Set to True to only fetch data for the first two servers with machine stats
def list_servers_without_machine_info(df_servers):
    df_servers_without_machine_info = df_servers[
        df_servers['ram_allocated_gb'].isna()
        | df_servers['storage_allocated_gb'].isna()
        | df_servers['storage_used_gb'].isna()
        | df_servers['cpu_count'].isna()
        | df_servers['ram_used_gb'].isna()
        | df_servers['cpu_name'].isna()
    ]
    # Filter out servers without apps
    df_servers_without_machine_info = df_servers_without_machine_info[
        df_servers_without_machine_info['apps'].apply(lambda x: list(x) != [])
    ]
    print("List of all fields available in the dataframe:")
    print(df_servers_without_machine_info.columns.tolist())
    # Narrow down the columns
    df_servers_without_machine_info = df_servers_without_machine_info[[
        'id',
        'host_name',
        'description',
        'ram_allocated_gb',
        'storage_allocated_gb',
        'storage_used_gb',
        'cpu_count',
        'ram_used_gb',
        'virtual',
        'cpu_name',
        'operating_system',
        'apps',
    ]]
    df_servers_without_machine_info.to_csv('servers_without_machine_info.csv', index=False)
def clean_servers_without_measurements(df_no_measurements):
    # Keep only servers that have apps; copy to avoid SettingWithCopyWarning on the assignment below
    df_no_measurements = df_no_measurements[df_no_measurements['apps'].apply(lambda x: list(x) != [])].copy()
    df_no_measurements['apps'] = df_no_measurements['apps'].apply(lambda x: [app['name'] for app in x])
    print("List of all fields available in the dataframe:")
    print(df_no_measurements.columns.tolist())
    df_no_measurements = df_no_measurements[['id', 'host_name', 'description', 'ram_allocated_gb', 'storage_allocated_gb', 'storage_used_gb', 'cpu_count', 'ram_used_gb', 'virtual', 'cpu_name', 'operating_system', 'apps']]
    df_no_measurements.to_csv('servers_no_measurements_but_with_apps.csv', index=False)
# Get the list of servers from the Tidal API or from the feather file
user_input = input("Do you want to get the latest servers list or get it from servers.feather? (Enter 'l' for latest or 'f' for feather): ")
if user_input.lower() == 'l':
    # Run `tidal export servers` and parse its JSON output.
    # The server ID is in the `id` field and the server name is in the `host_name` field.
    out = os.popen('tidal export servers').read()
    servers = json.loads(out)
    # Cache the latest servers list in a feather file
    df_servers = pd.DataFrame(servers)
    df_servers.to_feather('servers.feather')
elif user_input.lower() == 'f':
    # Load the servers list from the feather file
    servers = pd.read_feather('servers.feather').to_dict('records')
else:
    print("Invalid input. Please enter 'l' or 'f'.")
    raise SystemExit(1)
df_servers = pd.DataFrame(servers)
df_servers.to_csv('servers.csv', index=False)
list_servers_without_machine_info(df_servers)
# Define an empty list to store the dataframes for each server
dataframes = []
# Define an empty list to store the servers without measurements
servers_without_measurements = []
user_input = input("Do you want to get the latest machine stats or get it from output.feather? (Enter 'l for latest' or 'f for feather'): ")
if user_input.lower() == 'l':
# Iterate over each server ID
iteration_count = 0
for s in servers:
# Call the Tidal API to get the time-series data for the current server ID
# Replace {server_id} with the actual server ID
# The --csv-output flag will output the data in CSV format
command = f'tidal request -X GET "/api/v1/servers/{s["id"]}/measurements?filters[field_name]=cpu_utilization_timeseries" --csv-output 2>&1'
output = os.popen(command).read()
# Create a list of the CSV output lines
lines = output.strip().split('\n')
# If any of the lines contain the string '404 [No measurements found', then skip the server
if any('404 [No measurements found' in line for line in lines):
print(f'No measurements found for server {s["host_name"]}')
servers_without_measurements.append(s)
continue
else:
# Filter out lines that are not in CSV format
csv_lines = [line for line in lines if ',' in line]
# If there are no CSV lines, then skip the server
if not csv_lines:
print(f'No CSV measurements found for server {s["host_name"]}')
servers_without_measurements.append(s)
continue
# Print a few lines of the CSV output
print('\n'.join(csv_lines[:]))
# Sample output
# created_at,external_timestamp,field_name,id,measurable_id,measurable_type,updated_at,value
# 2023-11-14T21:13:01.838Z,2023-11-14T21:12:10.000Z,cpu_utilization_timeseries,167281,400,Server,2023-11-14T21:13:01.838Z,0.4372267332916957
# 2023-11-14T21:22:59.992Z,2023-11-14T21:22:08.000Z,cpu_utilization_timeseries,167429,400,Server,2023-11-14T21:22:59.992Z,0.2498438475952547
# 2023-11-15T01:23:01.992Z,2023-11-15T01:22:09.000Z,cpu_utilization_timeseries,173401,400,Server,2023-11-15T01:23:01.992Z,0.3121098626716612
# 2023-11-14T21:32:59.220Z,2023-11-14T21:32:08.000Z,cpu_utilization_timeseries,167687,400,Server,2023-11-14T21:32:59.220Z,0.6246096189881367
print(f'I see CSV measurements for server {s["host_name"]} with id {s["id"]}')
# Parse the header in the first row
header = csv_lines[0].split(',')
# Create a list of dictionaries from the remaining lines
rows = [dict(zip(header, line.split(','))) for line in csv_lines[1:]]
# Create a dataframe from the list of dictionaries
df = pd.DataFrame(rows)
# Add a column for the server name
df['server_name'] = s['host_name']
# Reorder the columns
df = df[['server_name', 'measurable_id', 'id', 'external_timestamp', 'field_name', 'value', 'created_at', 'measurable_type', 'updated_at']]
# Append the dataframe to the list
dataframes.append(df)
iteration_count += 1
if testing_flag and iteration_count == 2: # Only break if testing_flag is True
break
# Concatenate the dataframes into a single dataframe
result = pd.concat(dataframes)
# Save the dataframe to a feather file
result.reset_index().to_feather('output.feather')
# Convert servers_without_measurements to a DataFrame and save to a feather file
df_no_measurements = pd.DataFrame(servers_without_measurements)
df_no_measurements.reset_index().to_feather('df_no_measurements.feather')
elif user_input.lower() == 'f':
# Load the dataframe from the feather file
result = pd.read_feather('output.feather')
# Load df_no_measurements from the feather file
df_no_measurements = pd.read_feather('df_no_measurements.feather')
else:
print("Invalid input. Please enter either 'latest' or 'feather'.")
# Write the dataframe to a CSV file
result.to_csv('output.csv', index=False)
# Ensure 'value' is numeric before aggregation
result['value'] = pd.to_numeric(result['value'], errors='coerce')
# Aggregate the 'value' field by server by finding the average, standard deviation, min and max
result = result.groupby('server_name')['value'].agg(['mean', 'std', 'min', 'max'])
# Write the aggregated dataframe to a CSV file
result.reset_index().to_csv('output_aggregated.csv', index=False)
clean_servers_without_measurements(df_no_measurements)
print("CSV files generated:")
print("1. output.csv: This file contains the raw data fetched from the servers.")
print("2. output_aggregated.csv: This file contains the aggregated data (mean, std, min, max) for each server.")
print("3. servers_no_measurements_but_with_apps.csv: This file contains the details of servers without measurements but with apps.")
print("4. servers_without_machine_info.csv: This file contains the details of servers without machine info.")