APIM Golden Template for Azure OpenAI Load Balance and Usage Tracking

Load balancing multiple AOAI resources and tracking usage for one or more use cases

Decision points

  • Which PTU deployments (models, regions)
  • Which PAYGO deployments (models, regions)
  • Token tracking level (use case, teams, etc.; see the sketch below)
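
The policy templates at the end of this document emit token usage per APIM subscription via the azure-openai-emit-token-metric policy. For finer-grained tracking, additional dimensions can be added. A minimal sketch, assuming callers send a hypothetical x-use-case request header (the header name and the "unknown" fallback are assumptions, not part of the templates below):

```xml
<!-- Sketch: emit token metrics per use case in addition to per APIM subscription.
     Requires Application Insights logging to be enabled for the API.
     The "x-use-case" header is a hypothetical convention agreed with callers. -->
<azure-openai-emit-token-metric>
    <dimension name="SubscriptionId" value="@(context.Subscription.Id)" />
    <dimension name="UseCase" value="@(context.Request.Headers.GetValueOrDefault("x-use-case", "unknown"))" />
</azure-openai-emit-token-metric>
```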

Load Balancing Strategy

PAYGO only

  • Setup of priority groups: PAYGO (highest priority in region A), then PAYGO (lower priorities in regions B, C, D, ...)

Example:

  • Priority 1: PAYGO deployment 1 in region A, PAYGO deployment 2 in region A
  • Priority 2: PAYGO deployment 1 in region B, PAYGO deployment 2 in region B
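
Each priority group in the example above maps to a set of entries in the backends list used by the policy templates at the end of this document. A minimal sketch of the listBackends initialization for this PAYGO-only variant (the resource URLs are placeholders):

```xml
<!-- Sketch: PAYGO-only backend list (placeholder URLs); drop-in replacement for the
     listBackends initialization in the templates below -->
<set-variable name="listBackends" value="@{
    JArray backends = new JArray();
    // Priority 1: two PAYGO deployments in region A
    backends.Add(new JObject() { { "url", "https://aoai-paygo-a-1.openai.azure.com/" }, { "priority", 1 }, { "isThrottling", false }, { "retryAfter", DateTime.MinValue } });
    backends.Add(new JObject() { { "url", "https://aoai-paygo-a-2.openai.azure.com/" }, { "priority", 1 }, { "isThrottling", false }, { "retryAfter", DateTime.MinValue } });
    // Priority 2: two PAYGO deployments in region B
    backends.Add(new JObject() { { "url", "https://aoai-paygo-b-1.openai.azure.com/" }, { "priority", 2 }, { "isThrottling", false }, { "retryAfter", DateTime.MinValue } });
    backends.Add(new JObject() { { "url", "https://aoai-paygo-b-2.openai.azure.com/" }, { "priority", 2 }, { "isThrottling", false }, { "retryAfter", DateTime.MinValue } });
    return backends;
}" />
```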

PTU + PAYGO

  • Setup of priority groups: PTU (highest priority), then PAYGO (lower priorities, potentially multiple regions)

Example:

  • Priority 1: PTU in region A
  • Priority 2: PAYGO in region A
  • Priority 3: PAYGO in region B
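
The policy templates at the end of this document implement this pattern with one PTU backend at priority 1 and two PAYGO backends at priority 2; adjust the priority values if you want to stagger the PAYGO regions exactly as in this example.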

Separating low- and high-priority use cases

The key question is whether use cases differ in latency sensitivity and therefore need to be handled differently.

All use cases have equal priority

Typically a good choice when all use cases need to be fast and are equally important.

**Solution:**

  • One load balancing strategy (PTU+PAYGO)
  • Monitor PTU utilization and the number of 429 errors, and scale the number of PTUs up or down accordingly

Some use cases have higher priority

Typically useful when the PTUs are underutilized at certain times and low-priority use cases could reuse that capacity to save cost.

**Solution 1:**

  • Two load balancing strategies (PTU+PAYGO)
  • Low-priority requests go through a different API route in APIM and only hit the PTU during specific times of the day or days of the week (see the fragment below)
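
The first policy template below implements this time restriction: the PTU backend entry carries a hasTimeRestriction flag, and the backend selection logic skips flagged backends during the configured business hours. The relevant backend entry from that template:

```xml
<!-- Fragment from the low-priority policy below: the PTU backend is flagged so it is
     only eligible outside the configured business hours -->
backends.Add(new JObject()
{
    { "url", "https://azure-openai-ptu.openai.azure.com/" },
    { "priority", 1 },
    { "isThrottling", false },
    { "retryAfter", DateTime.MinValue },
    { "hasTimeRestriction", true }
});
```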

**Solution 2:**

  • Two load balancing strategies (PTU+PAYGO); requires two PTU deployments (e.g. 2x 50 PTUs for gpt-4o)
  • High-priority requests: load balancing strategy, implemented via its own API route:
    • Priority 1: PTU-1 in region A, PTU-2 in region A
    • Priority 2: PAYGO in region A
    • Priority 3: PAYGO in region B
  • Low-priority requests: load balancing strategy, implemented via its own API route:
    • Priority 1: PTU-2 in region A
    • Priority 2: PAYGO in region A
    • Priority 3: PAYGO in region B
  • This ensures that 50% of the PTU capacity (PTU-1) is reserved exclusively for the high-priority use cases
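
The first policy below corresponds to the low-priority route: the PTU backend is marked with hasTimeRestriction, so it is only eligible outside the configured business hours (08:00-22:00 W. Europe time in this template), and traffic otherwise falls back to the PAYGO backends at priority 2. The policy also handles 429/5xx fallback across backends, authenticates against the AOAI resources with the APIM managed identity, and emits per-subscription token metrics.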
<policies>
<inbound>
<base />
<!-- Getting the main variable where we keep the list of backends -->
<cache-lookup-value key="listBackends" variable-name="listBackends" />
<!-- If we can't find the variable, initialize it -->
<choose>
<when condition="@(context.Variables.ContainsKey("listBackends") == false)">
<set-variable name="listBackends" value="@{
// -------------------------------------------------
// ------- Explanation of backend properties -------
// -------------------------------------------------
// "url": Your backend url
// "priority": Lower value means higher priority over other backends.
// If you have one or more Priority 1 backends, they will always be used instead
// of Priority 2 or higher. Higher-value backends are only used when all lower-value (top priority) backends are throttling.
// "isThrottling": Indicates if this endpoint is currently returning 429 (Too Many Requests)
// "retryAfter": Used to determine when to mark this endpoint as healthy again after a 429 response
// "hasTimeRestriction": Optional; if true, this backend is only eligible during the allowed hours defined in the selection logic below
JArray backends = new JArray();
backends.Add(new JObject()
{
{ "url", "https://azure-openai-ptu.openai.azure.com/" },
{ "priority", 1},
{ "isThrottling", false },
{ "retryAfter", DateTime.MinValue },
{ "hasTimeRestriction", true }
});
backends.Add(new JObject()
{
{ "url", "https://azure-openai-paygo1.openai.azure.com/" },
{ "priority", 2},
{ "isThrottling", false },
{ "retryAfter", DateTime.MinValue }
});
backends.Add(new JObject()
{
{ "url", "https://azure-openai-paygo2.openai.azure.com/" },
{ "priority", 2},
{ "isThrottling", false },
{ "retryAfter", DateTime.MinValue }
});
return backends;
}" />
<!-- And store the variable into cache again -->
<cache-store-value key="listBackends" value="@((JArray)context.Variables["listBackends"])" duration="60" />
</when>
</choose>
<authentication-managed-identity resource="https://cognitiveservices.azure.com" output-token-variable-name="msi-access-token" ignore-error="false" />
<set-header name="Authorization" exists-action="override">
<value>@("Bearer " + (string)context.Variables["msi-access-token"])</value>
</set-header>
<set-variable name="backendIndex" value="-1" />
<set-variable name="remainingBackends" value="1" />
<!-- Emit token usage per APIM Subscription, requires Application Insight logging to be enabled for API -->
<azure-openai-emit-token-metric>
<dimension name="SubscriptionId" value="@(context.Subscription.Id)" />
</azure-openai-emit-token-metric>
</inbound>
<backend>
<retry condition="@(context.Response != null && (context.Response.StatusCode == 429 || context.Response.StatusCode >= 500) && ((Int32)context.Variables["remainingBackends"]) > 0)" count="50" interval="0">
<!-- Before picking the backend, let's verify if there is any that should be set to not throttling anymore -->
<set-variable name="listBackends" value="@{
JArray backends = (JArray)context.Variables["listBackends"];
for (int i = 0; i < backends.Count; i++)
{
JObject backend = (JObject)backends[i];
if (backend.Value<bool>("isThrottling") && DateTime.Now >= backend.Value<DateTime>("retryAfter"))
{
backend["isThrottling"] = false;
backend["retryAfter"] = DateTime.MinValue;
}
}
return backends;
}" />
<cache-store-value key="listBackends" value="@((JArray)context.Variables["listBackends"])" duration="60" />
<!-- This is the main logic to pick the backend to be used -->
<set-variable name="backendIndex" value="@{
// Define when time-restricted backends (the PTU) may be used on this route:
// returns true only OUTSIDE the 08:00-22:00 window below, i.e. the PTU is only used overnight
bool IsWithinAllowedHours()
{
TimeZoneInfo timeZoneInfo = TimeZoneInfo.FindSystemTimeZoneById("W. Europe Standard Time");
DateTime currentTime = TimeZoneInfo.ConvertTimeFromUtc(DateTime.UtcNow, timeZoneInfo);
TimeSpan startTime = new TimeSpan(8, 0, 0);
TimeSpan endTime = new TimeSpan(22, 0, 0);
return !(currentTime.TimeOfDay >= startTime && currentTime.TimeOfDay <= endTime);
}
JArray backends = (JArray)context.Variables["listBackends"];
bool isWithinAllowedHours = IsWithinAllowedHours();
int selectedPriority = Int32.MaxValue;
List<int> availableBackends = new List<int>();
for (int i = 0; i < backends.Count; i++)
{
JObject backend = (JObject)backends[i];
if (!backend.Value<bool>("isThrottling"))
{
int backendPriority = backend.Value<int>("priority");
bool hasTimeRestriction = backend.Value<bool?>("hasTimeRestriction") ?? false;
// Check the time condition for the backend with time restriction
if (hasTimeRestriction && !isWithinAllowedHours)
{
continue; // Skip this backend if it's restricted by time and we're not within the allowed hours
}
if (backendPriority < selectedPriority)
{
selectedPriority = backendPriority;
availableBackends.Clear();
availableBackends.Add(i);
}
else if (backendPriority == selectedPriority)
{
availableBackends.Add(i);
}
}
}
if (availableBackends.Count == 1)
{
return availableBackends[0];
}
if (availableBackends.Count > 0)
{
//Returns a random backend from the list if we have more than one available with the same priority
return availableBackends[new Random().Next(0, availableBackends.Count)];
}
else
{
//If there are no available backends, the request will be sent to the first one
return 0;
}
}" />
<set-variable name="backendUrl" value="@(((JObject)((JArray)context.Variables["listBackends"])[(Int32)context.Variables["backendIndex"]]).Value<string>("url") + "/openai")" />
<set-backend-service base-url="@((string)context.Variables["backendUrl"])" />
<forward-request buffer-request-body="true" />
<choose>
<!-- In case we got 429 or 5xx from a backend, update the list with its status -->
<when condition="@(context.Response != null && (context.Response.StatusCode == 429 || context.Response.StatusCode >= 500) )">
<cache-lookup-value key="listBackends" variable-name="listBackends" />
<set-variable name="listBackends" value="@{
JArray backends = (JArray)context.Variables["listBackends"];
int currentBackendIndex = context.Variables.GetValueOrDefault<int>("backendIndex");
int retryAfter = Convert.ToInt32(context.Response.Headers.GetValueOrDefault("Retry-After", "-1"));
if (retryAfter == -1)
{
retryAfter = Convert.ToInt32(context.Response.Headers.GetValueOrDefault("x-ratelimit-reset-requests", "-1"));
}
if (retryAfter == -1)
{
retryAfter = Convert.ToInt32(context.Response.Headers.GetValueOrDefault("x-ratelimit-reset-tokens", "10"));
}
JObject backend = (JObject)backends[currentBackendIndex];
backend["isThrottling"] = true;
backend["retryAfter"] = DateTime.Now.AddSeconds(retryAfter);
return backends;
}" />
<cache-store-value key="listBackends" value="@((JArray)context.Variables["listBackends"])" duration="60" />
<set-variable name="remainingBackends" value="@{
JArray backends = (JArray)context.Variables["listBackends"];
int remainingBackends = 0;
for (int i = 0; i < backends.Count; i++)
{
JObject backend = (JObject)backends[i];
if (!backend.Value<bool>("isThrottling"))
{
remainingBackends++;
}
}
return remainingBackends;
}" />
</when>
</choose>
</retry>
</backend>
<outbound>
<base />
<!-- This will return the used backend URL in the HTTP header response. Remove it if you don't want to expose this data -->
<set-header name="x-openai-backendurl" exists-action="override">
<value>@(context.Variables.GetValueOrDefault<string>("backendUrl", "none"))</value>
</set-header>
</outbound>
<on-error>
<base />
</on-error>
</policies>
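
The second policy below is the standard PTU+PAYGO load balancing policy without any time restriction (suitable for the high-priority route, or as the single strategy when all use cases have equal priority): the PTU backend is always preferred at priority 1, and the PAYGO backends at priority 2 are only used while the PTU is throttling. Note that both templates use the same cache key (listBackends); if you deploy both policies in the same APIM instance, consider giving each route its own cache key so the two routes keep separate backend lists.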
<policies>
<inbound>
<base />
<!-- Getting the main variable where we keep the list of backends -->
<cache-lookup-value key="listBackends" variable-name="listBackends" />
<!-- If we can't find the variable, initialize it -->
<choose>
<when condition="@(context.Variables.ContainsKey("listBackends") == false)">
<set-variable name="listBackends" value="@{
// -------------------------------------------------
// ------- Explanation of backend properties -------
// -------------------------------------------------
// "url": Your backend url
// "priority": Lower value means higher priority over other backends.
// If you have one or more Priority 1 backends, they will always be used instead
// of Priority 2 or higher. Higher-value backends are only used when all lower-value (top priority) backends are throttling.
// "isThrottling": Indicates if this endpoint is currently returning 429 (Too Many Requests)
// "retryAfter": Used to determine when to mark this endpoint as healthy again after a 429 response
JArray backends = new JArray();
backends.Add(new JObject()
{
{ "url", "https://azure-openai-ptu.openai.azure.com/" },
{ "priority", 1},
{ "isThrottling", false },
{ "retryAfter", DateTime.MinValue }
});
backends.Add(new JObject()
{
{ "url", "https://azure-openai-paygo1.openai.azure.com/" },
{ "priority", 2},
{ "isThrottling", false },
{ "retryAfter", DateTime.MinValue }
});
backends.Add(new JObject()
{
{ "url", "https://azure-openai-paygo2.openai.azure.com/" },
{ "priority", 2},
{ "isThrottling", false },
{ "retryAfter", DateTime.MinValue }
});
return backends;
}" />
<!-- And store the variable into cache again -->
<cache-store-value key="listBackends" value="@((JArray)context.Variables["listBackends"])" duration="60" />
</when>
</choose>
<authentication-managed-identity resource="https://cognitiveservices.azure.com" output-token-variable-name="msi-access-token" ignore-error="false" />
<set-header name="Authorization" exists-action="override">
<value>@("Bearer " + (string)context.Variables["msi-access-token"])</value>
</set-header>
<set-variable name="backendIndex" value="-1" />
<set-variable name="remainingBackends" value="1" />
<!-- Emit token usage per APIM Subscription, requires Application Insight logging to be enabled for API -->
<azure-openai-emit-token-metric>
<dimension name="SubscriptionId" value="@(context.Subscription.Id)" />
</azure-openai-emit-token-metric>
</inbound>
<backend>
<retry condition="@(context.Response != null && (context.Response.StatusCode == 429 || context.Response.StatusCode >= 500) && ((Int32)context.Variables["remainingBackends"]) > 0)" count="50" interval="0">
<!-- Before picking the backend, let's verify if there is any that should be set to not throttling anymore -->
<set-variable name="listBackends" value="@{
JArray backends = (JArray)context.Variables["listBackends"];
for (int i = 0; i < backends.Count; i++)
{
JObject backend = (JObject)backends[i];
if (backend.Value<bool>("isThrottling") && DateTime.Now >= backend.Value<DateTime>("retryAfter"))
{
backend["isThrottling"] = false;
backend["retryAfter"] = DateTime.MinValue;
}
}
return backends;
}" />
<cache-store-value key="listBackends" value="@((JArray)context.Variables["listBackends"])" duration="60" />
<!-- This is the main logic to pick the backend to be used -->
<set-variable name="backendIndex" value="@{
JArray backends = (JArray)context.Variables["listBackends"];
int selectedPriority = Int32.MaxValue;
List<int> availableBackends = new List<int>();
for (int i = 0; i < backends.Count; i++)
{
JObject backend = (JObject)backends[i];
if (!backend.Value<bool>("isThrottling"))
{
int backendPriority = backend.Value<int>("priority");
if (backendPriority < selectedPriority)
{
selectedPriority = backendPriority;
availableBackends.Clear();
availableBackends.Add(i);
}
else if (backendPriority == selectedPriority)
{
availableBackends.Add(i);
}
}
}
if (availableBackends.Count == 1)
{
return availableBackends[0];
}
if (availableBackends.Count > 0)
{
//Returns a random backend from the list if we have more than one available with the same priority
return availableBackends[new Random().Next(0, availableBackends.Count)];
}
else
{
//If there are no available backends, the request will be sent to the first one
return 0;
}
}" />
<set-variable name="backendUrl" value="@(((JObject)((JArray)context.Variables["listBackends"])[(Int32)context.Variables["backendIndex"]]).Value<string>("url") + "/openai")" />
<set-backend-service base-url="@((string)context.Variables["backendUrl"])" />
<forward-request buffer-request-body="true" />
<choose>
<!-- In case we got 429 or 5xx from a backend, update the list with its status -->
<when condition="@(context.Response != null && (context.Response.StatusCode == 429 || context.Response.StatusCode >= 500) )">
<cache-lookup-value key="listBackends" variable-name="listBackends" />
<set-variable name="listBackends" value="@{
JArray backends = (JArray)context.Variables["listBackends"];
int currentBackendIndex = context.Variables.GetValueOrDefault<int>("backendIndex");
int retryAfter = Convert.ToInt32(context.Response.Headers.GetValueOrDefault("Retry-After", "-1"));
if (retryAfter == -1)
{
retryAfter = Convert.ToInt32(context.Response.Headers.GetValueOrDefault("x-ratelimit-reset-requests", "-1"));
}
if (retryAfter == -1)
{
retryAfter = Convert.ToInt32(context.Response.Headers.GetValueOrDefault("x-ratelimit-reset-tokens", "10"));
}
JObject backend = (JObject)backends[currentBackendIndex];
backend["isThrottling"] = true;
backend["retryAfter"] = DateTime.Now.AddSeconds(retryAfter);
return backends;
}" />
<cache-store-value key="listBackends" value="@((JArray)context.Variables["listBackends"])" duration="60" />
<set-variable name="remainingBackends" value="@{
JArray backends = (JArray)context.Variables["listBackends"];
int remainingBackends = 0;
for (int i = 0; i < backends.Count; i++)
{
JObject backend = (JObject)backends[i];
if (!backend.Value<bool>("isThrottling"))
{
remainingBackends++;
}
}
return remainingBackends;
}" />
</when>
</choose>
</retry>
</backend>
<outbound>
<base />
<!-- This will return the used backend URL in the HTTP header response. Remove it if you don't want to expose this data -->
<set-header name="x-openai-backendurl" exists-action="override">
<value>@(context.Variables.GetValueOrDefault<string>("backendUrl", "none"))</value>
</set-header>
</outbound>
<on-error>
<base />
</on-error>
</policies>