Last active
January 8, 2024 19:07
-
-
Save fortran01/35637959b7abbf47d7ff2bb7aae44a56 to your computer and use it in GitHub Desktop.
Fetch machine-stats time-series data from the Tidal platform
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import os | |
import pandas as pd | |
testing_flag = False # Set to True to only fetch data for the first two servers with machine stats | |
def list_servers_without_machine_info(df_servers): | |
df_servers_without_machine_info = df_servers[ | |
df_servers['ram_allocated_gb'].isna() | |
| df_servers['storage_allocated_gb'].isna() | |
| df_servers['storage_used_gb'].isna() | |
| df_servers['cpu_count'].isna() | |
| df_servers['ram_used_gb'].isna() | |
| df_servers['cpu_name'].isna() | |
] | |
# Filter out servers without apps | |
df_servers_without_machine_info = df_servers_without_machine_info[ | |
df_servers_without_machine_info['apps'].apply(lambda x: list(x) != []) | |
] | |
print("List of all fields available in the dataframe:") | |
print(df_servers_without_machine_info.columns.tolist()) | |
# Narrow down the columns | |
df_servers_without_machine_info = df_servers_without_machine_info[['id', | |
'host_name', | |
'description', | |
'ram_allocated_gb', | |
'storage_allocated_gb', | |
'storage_used_gb', | |
'cpu_count', | |
'ram_used_gb', | |
'virtual', | |
'cpu_name', | |
'operating_system', | |
'apps']] | |
df_servers_without_machine_info.to_csv('servers_without_machine_info.csv', index=False) | |
def clean_servers_without_measurements(df_no_measurements): | |
df_no_measurements = df_no_measurements[df_no_measurements['apps'].apply(lambda x: list(x) != [])] | |
df_no_measurements['apps'] = df_no_measurements['apps'].apply(lambda x: [app['name'] for app in x]) | |
print("List of all fields available in the dataframe:") | |
print(df_no_measurements.columns.tolist()) | |
df_no_measurements = df_no_measurements[['id', 'host_name', 'description', 'ram_allocated_gb', 'storage_allocated_gb', 'storage_used_gb', 'cpu_count', 'ram_used_gb', 'virtual', 'cpu_name', 'operating_system', 'apps']] | |
df_no_measurements.to_csv('servers_no_measurements_but_with_apps.csv', index=False) | |
# Get the list of servers from the Tidal API or from the feather file | |
user_input = input("Do you want to get the latest servers list or get it from servers.feather? (Enter 'l for latest' or 'f for feather'): ") | |
if user_input.lower() == 'l': | |
# Execute `tidal export servers > servers.json` | |
# Then parse servers.json | |
# Server ID is in the `id` field and server name is in the `host_name` field | |
out = os.popen('tidal export servers').read() | |
servers = json.loads(out) | |
# Store the latest servers list in a feather file | |
df_servers = pd.DataFrame(servers) | |
df_servers.to_feather('servers.feather') | |
elif user_input.lower() == 'f': | |
# Load the servers list from the feather file | |
servers = pd.read_feather('servers.feather').to_dict('records') | |
else: | |
print("Invalid input. Please enter either 'latest' or 'feather'.") | |
df_servers = pd.DataFrame(servers) | |
df_servers.to_csv('servers.csv', index=False) | |
list_servers_without_machine_info(df_servers) | |
# Define an empty list to store the dataframes for each server | |
dataframes = [] | |
# Define an empty list to store the servers without measurements | |
servers_without_measurements = [] | |
user_input = input("Do you want to get the latest machine stats or get it from output.feather? (Enter 'l for latest' or 'f for feather'): ") | |
if user_input.lower() == 'l': | |
# Iterate over each server ID | |
iteration_count = 0 | |
for s in servers: | |
# Call the Tidal API to get the time-series data for the current server ID | |
# Replace {server_id} with the actual server ID | |
# The --csv-output flag will output the data in CSV format | |
command = f'tidal request -X GET "/api/v1/servers/{s["id"]}/measurements?filters[field_name]=cpu_utilization_timeseries" --csv-output 2>&1' | |
output = os.popen(command).read() | |
# Create a list of the CSV output lines | |
lines = output.strip().split('\n') | |
# If any of the lines contain the string '404 [No measurements found', then skip the server | |
if any('404 [No measurements found' in line for line in lines): | |
print(f'No measurements found for server {s["host_name"]}') | |
servers_without_measurements.append(s) | |
continue | |
else: | |
# Filter out lines that are not in CSV format | |
csv_lines = [line for line in lines if ',' in line] | |
# If there are no CSV lines, then skip the server | |
if not csv_lines: | |
print(f'No CSV measurements found for server {s["host_name"]}') | |
servers_without_measurements.append(s) | |
continue | |
# Print a few lines of the CSV output | |
print('\n'.join(csv_lines[:])) | |
# Sample output | |
# created_at,external_timestamp,field_name,id,measurable_id,measurable_type,updated_at,value | |
# 2023-11-14T21:13:01.838Z,2023-11-14T21:12:10.000Z,cpu_utilization_timeseries,167281,400,Server,2023-11-14T21:13:01.838Z,0.4372267332916957 | |
# 2023-11-14T21:22:59.992Z,2023-11-14T21:22:08.000Z,cpu_utilization_timeseries,167429,400,Server,2023-11-14T21:22:59.992Z,0.2498438475952547 | |
# 2023-11-15T01:23:01.992Z,2023-11-15T01:22:09.000Z,cpu_utilization_timeseries,173401,400,Server,2023-11-15T01:23:01.992Z,0.3121098626716612 | |
# 2023-11-14T21:32:59.220Z,2023-11-14T21:32:08.000Z,cpu_utilization_timeseries,167687,400,Server,2023-11-14T21:32:59.220Z,0.6246096189881367 | |
print(f'I see CSV measurements for server {s["host_name"]} with id {s["id"]}') | |
# Parse the header in the first row | |
header = csv_lines[0].split(',') | |
# Create a list of dictionaries from the remaining lines | |
rows = [dict(zip(header, line.split(','))) for line in csv_lines[1:]] | |
# Create a dataframe from the list of dictionaries | |
df = pd.DataFrame(rows) | |
# Add a column for the server name | |
df['server_name'] = s['host_name'] | |
# Reorder the columns | |
df = df[['server_name', 'measurable_id', 'id', 'external_timestamp', 'field_name', 'value', 'created_at', 'measurable_type', 'updated_at']] | |
# Append the dataframe to the list | |
dataframes.append(df) | |
iteration_count += 1 | |
if testing_flag and iteration_count == 2: # Only break if testing_flag is True | |
break | |
# Concatenate the dataframes into a single dataframe | |
result = pd.concat(dataframes) | |
# Save the dataframe to a feather file | |
result.reset_index().to_feather('output.feather') | |
# Convert servers_without_measurements to a DataFrame and save to a feather file | |
df_no_measurements = pd.DataFrame(servers_without_measurements) | |
df_no_measurements.reset_index().to_feather('df_no_measurements.feather') | |
elif user_input.lower() == 'f': | |
# Load the dataframe from the feather file | |
result = pd.read_feather('output.feather') | |
# Load df_no_measurements from the feather file | |
df_no_measurements = pd.read_feather('df_no_measurements.feather') | |
else: | |
print("Invalid input. Please enter either 'latest' or 'feather'.") | |
# Write the dataframe to a CSV file | |
result.to_csv('output.csv', index=False) | |
# Ensure 'value' is numeric before aggregation | |
result['value'] = pd.to_numeric(result['value'], errors='coerce') | |
# Aggregate the 'value' field by server by finding the average, standard deviation, min and max | |
result = result.groupby('server_name')['value'].agg(['mean', 'std', 'min', 'max']) | |
# Write the aggregated dataframe to a CSV file | |
result.reset_index().to_csv('output_aggregated.csv', index=False) | |
clean_servers_without_measurements(df_no_measurements) | |
print("CSV files generated:") | |
print("1. output.csv: This file contains the raw data fetched from the servers.") | |
print("2. output_aggregated.csv: This file contains the aggregated data (mean, std, min, max) for each server.") | |
print("3. servers_no_measurements_but_with_apps.csv: This file contains the details of servers without measurements but with apps.") | |
print("4. servers_without_machine_info.csv: This file contains the details of servers without machine info.") | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment