Skip to content

Instantly share code, notes, and snippets.

Last active June 28, 2024 11:17
Show Gist options
  • Save csiebler/3316f74b4e03c40b3e23d5a2180cf8b6 to your computer and use it in GitHub Desktop.
Save csiebler/3316f74b4e03c40b3e23d5a2180cf8b6 to your computer and use it in GitHub Desktop.
APIM policy for AOAI for version forwarding
<base />
<!-- Re-add the api-version query parameter, so the backends know which version we want -->
<set-query-parameter name="api-version" exists-action="override">
<!-- Getting the main variable where we keep the list of backends -->
<cache-lookup-value key="listBackends" variable-name="listBackends" />
<!-- If we can't find the variable, initialize it -->
<when condition="@(context.Variables.ContainsKey("listBackends") == false)">
<set-variable name="listBackends" value="@{
    // Builds the initial list of backends stored in the "listBackends" variable.
    //
    // -------------------------------------------------
    // ------- Explanation of backend properties -------
    // -------------------------------------------------
    // "url": Your backend URL. It is left empty here and must be filled in per backend.
    // "priority": Lower value means higher priority over other backends.
    //     If you have one or more Priority 1 backends, they will always be used instead
    //     of Priority 2 or higher. Backends with higher values are only used when all
    //     lower-value (top priority) backends are throttling.
    // "isThrottling": Indicates whether this endpoint is currently returning 429 (Too Many Requests).
    // "retryAfter": Used to know when to mark this endpoint as healthy again after a 429 response.
    //     DateTime.MinValue is the initial "nothing pending" value.
    JArray backends = new JArray();

    backends.Add(new JObject()
    {
        { "url", "" },
        { "priority", 1 },
        { "isThrottling", false },
        { "retryAfter", DateTime.MinValue }
    });

    backends.Add(new JObject()
    {
        { "url", "" },
        { "priority", 2 },
        { "isThrottling", false },
        { "retryAfter", DateTime.MinValue }
    });

    backends.Add(new JObject()
    {
        { "url", "" },
        { "priority", 3 },
        { "isThrottling", false },
        { "retryAfter", DateTime.MinValue }
    });

    return backends;
}" />
<!-- And store the variable into cache again -->
<cache-store-value key="listBackends" value="@((JArray)context.Variables["listBackends"])" duration="60" />
<!-- Acquire a managed identity access token and keep it in the "msi-access-token" variable. NOTE(review): resource is empty here and presumably has to be set to the target resource URI; confirm before deploying. -->
<authentication-managed-identity resource="" output-token-variable-name="msi-access-token" ignore-error="false" />
<!-- Send the token to the backend as a Bearer Authorization header, overriding any existing Authorization header -->
<set-header name="Authorization" exists-action="override">
<value>@("Bearer " + (string)context.Variables["msi-access-token"])</value>
<!-- Initial values for the backend selection state that the retry logic below reads and updates -->
<set-variable name="backendIndex" value="-1" />
<set-variable name="remainingBackends" value="1" />
<!-- Emit token usage per APIM Subscription, requires Application Insight logging to be enabled for API -->
<dimension name="SubscriptionId" value="@(context.Subscription.Id)" />
<retry condition="@(context.Response != null && (context.Response.StatusCode == 429 || context.Response.StatusCode >= 500) && ((Int32)context.Variables["remainingBackends"]) > 0)" count="50" interval="0">
<!-- Before picking the backend, let's verify if there is any that should be set to not throttling anymore -->
<set-variable name="listBackends" value="@{
    // Clears the throttling state of every backend whose "retryAfter" time has passed,
    // so it becomes eligible for selection again. Returns the same (mutated) list.
    JArray backends = (JArray)context.Variables["listBackends"];

    for (int i = 0; i < backends.Count; i++)
    {
        JObject backend = (JObject)backends[i];

        // Only touch backends that are flagged as throttling and whose wait time is over.
        if (backend.Value<bool>("isThrottling") && DateTime.Now >= backend.Value<DateTime>("retryAfter"))
        {
            backend["isThrottling"] = false;
            backend["retryAfter"] = DateTime.MinValue;
        }
    }

    return backends;
}" />
<cache-store-value key="listBackends" value="@((JArray)context.Variables["listBackends"])" duration="60" />
<!-- This is the main logic to pick the backend to be used -->
<set-variable name="backendIndex" value="@{
    // Picks the index of the backend to use for this attempt.
    // Among the backends that are not throttling, only those with the lowest
    // "priority" value are candidates; one of them is chosen at random.
    JArray backends = (JArray)context.Variables["listBackends"];

    int selectedPriority = Int32.MaxValue;
    List<int> availableBackends = new List<int>();

    for (int i = 0; i < backends.Count; i++)
    {
        JObject backend = (JObject)backends[i];

        if (!backend.Value<bool>("isThrottling"))
        {
            int backendPriority = backend.Value<int>("priority");

            if (backendPriority < selectedPriority)
            {
                // Found a better (lower) priority: discard the candidates collected so far.
                selectedPriority = backendPriority;
                availableBackends.Clear();
                availableBackends.Add(i);
            }
            else if (backendPriority == selectedPriority)
            {
                // Same priority as the current best: add it as another candidate.
                availableBackends.Add(i);
            }
        }
    }

    if (availableBackends.Count == 1)
    {
        return availableBackends[0];
    }

    if (availableBackends.Count > 0)
    {
        // Returns a random backend from the list if we have more than one available with the same priority
        return availableBackends[new Random().Next(0, availableBackends.Count)];
    }

    // If there are no available backends, the request will be sent to the first one
    return 0;
}" />
<!-- Build the base URL for the selected backend: its "url" value from listBackends followed by "/openai" -->
<set-variable name="backendUrl" value="@(((JObject)((JArray)context.Variables["listBackends"])[(Int32)context.Variables["backendIndex"]]).Value<string>("url") + "/openai")" />
<!-- Route this attempt to the selected backend -->
<set-backend-service base-url="@((string)context.Variables["backendUrl"])" />
<!-- Forward the request with request body buffering enabled (presumably so the body can be sent again on a retry; confirm) -->
<forward-request buffer-request-body="true" />
<!-- In case we got 429 or 5xx from a backend, update the list with its status -->
<when condition="@(context.Response != null && (context.Response.StatusCode == 429 || context.Response.StatusCode >= 500) )">
<cache-lookup-value key="listBackends" variable-name="listBackends" />
<set-variable name="listBackends" value="@{
    // Marks the backend used for the failed attempt as throttling and records when it
    // may be tried again. Returns the same (mutated) list.
    JArray backends = (JArray)context.Variables["listBackends"];
    int currentBackendIndex = context.Variables.GetValueOrDefault<int>("backendIndex");

    // Wait time in seconds, taken from the first header that holds a non-negative integer:
    // Retry-After, then x-ratelimit-reset-requests, then x-ratelimit-reset-tokens.
    // TryParse is used so that a missing or non-integer header value (for example an
    // HTTP date or a duration string) falls through to the next header instead of throwing.
    int retryAfter;
    if (!Int32.TryParse(context.Response.Headers.GetValueOrDefault("Retry-After", ""), out retryAfter) || retryAfter < 0)
    {
        if (!Int32.TryParse(context.Response.Headers.GetValueOrDefault("x-ratelimit-reset-requests", ""), out retryAfter) || retryAfter < 0)
        {
            if (!Int32.TryParse(context.Response.Headers.GetValueOrDefault("x-ratelimit-reset-tokens", ""), out retryAfter) || retryAfter < 0)
            {
                // No usable header: fall back to a 10 second wait.
                retryAfter = 10;
            }
        }
    }

    JObject backend = (JObject)backends[currentBackendIndex];
    backend["isThrottling"] = true;
    backend["retryAfter"] = DateTime.Now.AddSeconds(retryAfter);

    return backends;
}" />
<cache-store-value key="listBackends" value="@((JArray)context.Variables["listBackends"])" duration="60" />
<set-variable name="remainingBackends" value="@{
    // Counts the backends that are not currently flagged as throttling.
    // The retry condition uses this value to decide whether another attempt is worthwhile.
    JArray backends = (JArray)context.Variables["listBackends"];

    int remainingBackends = 0;

    for (int i = 0; i < backends.Count; i++)
    {
        JObject backend = (JObject)backends[i];

        if (!backend.Value<bool>("isThrottling"))
        {
            remainingBackends++;
        }
    }

    return remainingBackends;
}" />
<base />
<!-- This returns the URL of the backend that was used in an HTTP response header. Remove it if you don't want to expose this data -->
<set-header name="x-openai-backendurl" exists-action="override">
<value>@(context.Variables.GetValueOrDefault<string>("backendUrl", "none"))</value>
<base />
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment